Diffstat (limited to 'fs')
449 files changed, 21562 insertions, 13761 deletions
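Editor's note on the dominant pattern in the 9p/adfs/affs hunks below: the series moves dentry-operations assignment out of every lookup()/create() path (per-dentry d_set_d_op() calls) and into a single superblock-wide sb->s_d_op set at mount time, which d_alloc() then copies into each new dentry. A minimal sketch of the converted shape, using the 2.6.38-era d_revalidate signature; the names example_dentry_operations, example_d_revalidate, and example_fill_super are hypothetical stand-ins for the per-filesystem ones, not taken from the patch:

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/namei.h>

static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	return 1;	/* placeholder: treat every dentry as valid */
}

static const struct dentry_operations example_dentry_operations = {
	.d_revalidate	= example_d_revalidate,
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	/*
	 * Set once here; d_alloc() copies s_d_op into every dentry
	 * created under this superblock, so the per-dentry
	 * d_set_d_op() calls in the lookup()/create() paths (removed
	 * in the hunks below) become unnecessary.
	 */
	sb->s_d_op = &example_dentry_operations;

	/* root inode / s_root setup elided */
	return 0;
}

This is why, for example, v9fs_mount() gains the s_d_op assignment while v9fs_vfs_lookup(), v9fs_create(), adfs_lookup(), and affs_lookup() all drop their d_set_d_op() calls.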
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 7e051147679..814ac4e213a 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -9,6 +9,8 @@ config 9P_FS If unsure, say N. +if 9P_FS + config 9P_FSCACHE bool "Enable 9P client caching support (EXPERIMENTAL)" depends on EXPERIMENTAL @@ -20,7 +22,6 @@ config 9P_FSCACHE config 9P_FS_POSIX_ACL bool "9P POSIX Access Control Lists" - depends on 9P_FS select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +endif diff --git a/fs/9p/Makefile b/fs/9p/Makefile index f8ba37effd1..ab8c1278063 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o 9p-objs := \ vfs_super.o \ vfs_inode.o \ + vfs_inode_dotl.o \ vfs_addr.o \ vfs_file.o \ vfs_dir.o \ diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 6e58c4ca1e6..02a2cf61631 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) { ssize_t size; void *value = NULL; - struct posix_acl *acl = NULL;; + struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); if (size > 0) { @@ -365,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; if (!S_ISDIR(inode->i_mode)) { - retval = -EINVAL; + retval = acl ? -EINVAL : 0; goto err_out; } break; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index cb6396855e2..c4b5d8864f0 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -113,9 +113,27 @@ struct v9fs_session_info { struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); -void v9fs_session_close(struct v9fs_session_info *v9ses); -void v9fs_session_cancel(struct v9fs_session_info *v9ses); -void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); +extern void v9fs_session_close(struct v9fs_session_info *v9ses); +extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); +extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); +extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nameidata); +extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); +extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *p); +extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); + +extern const struct inode_operations v9fs_dir_inode_operations_dotl; +extern const struct inode_operations v9fs_file_inode_operations_dotl; +extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); /* other default globals */ #define V9FS_PORT 564 @@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000L; } + +/** + * v9fs_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_inode_from_fid(struct 
v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_dotl(v9ses, fid, sb); + else + return v9fs_inode(v9ses, fid, sb); +} diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index bab0eac873f..b789f8e597e 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -59,7 +59,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); -void v9fs_dentry_release(struct dentry *); int v9fs_uflags2omode(int uflags, int extended); ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 466d2a4fc5c..233b7d4ffe5 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) * */ -void v9fs_dentry_release(struct dentry *dentry) +static void v9fs_dentry_release(struct dentry *dentry) { struct v9fs_dentry *dent; struct p9_fid *temp, *current_fid; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 59782981b22..b76a40bdf4c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -49,15 +49,8 @@ static const struct inode_operations v9fs_dir_inode_operations; static const struct inode_operations v9fs_dir_inode_operations_dotu; -static const struct inode_operations v9fs_dir_inode_operations_dotl; static const struct inode_operations v9fs_file_inode_operations; -static const struct inode_operations v9fs_file_inode_operations_dotl; static const struct inode_operations v9fs_symlink_inode_operations; -static const struct inode_operations v9fs_symlink_inode_operations_dotl; - -static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, - dev_t rdev); /** * unixmode2p9mode - convert unix mode bits to plan 9 @@ -251,41 +244,6 @@ void v9fs_destroy_inode(struct inode *inode) #endif /** - * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a - * new file system object. This checks the S_ISGID to determine the owning - * group of the new file system object. - */ - -static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) -{ - BUG_ON(dir_inode == NULL); - - if (dir_inode->i_mode & S_ISGID) { - /* set_gid bit is set.*/ - return dir_inode->i_gid; - } - return current_fsgid(); -} - -/** - * v9fs_dentry_from_dir_inode - helper function to get the dentry from - * dir inode. - * - */ - -static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) -{ - struct dentry *dentry; - - spin_lock(&inode->i_lock); - /* Directory should have only one entry. 
*/ - BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); - dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); - spin_unlock(&inode->i_lock); - return dentry; -} - -/** * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with @@ -454,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode) #endif } -static struct inode * +struct inode * v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { @@ -489,60 +447,6 @@ error: return ERR_PTR(err); } -static struct inode * -v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) -{ - struct inode *ret = NULL; - int err; - struct p9_stat_dotl *st; - - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); - if (IS_ERR(st)) - return ERR_CAST(st); - - ret = v9fs_get_inode(sb, st->st_mode); - if (IS_ERR(ret)) { - err = PTR_ERR(ret); - goto error; - } - - v9fs_stat2inode_dotl(st, ret); - ret->i_ino = v9fs_qid2ino(&st->qid); -#ifdef CONFIG_9P_FSCACHE - v9fs_vcookie_set_qid(ret, &st->qid); - v9fs_cache_inode_get_cookie(ret); -#endif - err = v9fs_get_acl(ret, fid); - if (err) { - iput(ret); - goto error; - } - kfree(st); - return ret; -error: - kfree(st); - return ERR_PTR(err); -} - -/** - * v9fs_inode_from_fid - Helper routine to populate an inode by - * issuing a attribute request - * @v9ses: session information - * @fid: fid to issue attribute request for - * @sb: superblock on which to create inode - * - */ -static inline struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) -{ - if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_dotl(v9ses, fid, sb); - else - return v9fs_inode(v9ses, fid, sb); -} - /** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted @@ -633,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - - if (v9ses->cache) - d_set_d_op(dentry, &v9fs_cached_dentry_operations); - else - d_set_d_op(dentry, &v9fs_dentry_operations); - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) @@ -657,144 +555,6 @@ error: } /** - * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. - * @dir: directory inode that is being created - * @dentry: dentry that is being deleted - * @mode: create permissions - * @nd: path information - * - */ - -static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, - struct nameidata *nd) -{ - int err = 0; - char *name = NULL; - gid_t gid; - int flags; - mode_t mode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL; - struct p9_fid *dfid, *ofid; - struct file *filp; - struct p9_qid qid; - struct inode *inode; - struct posix_acl *pacl = NULL, *dacl = NULL; - - v9ses = v9fs_inode2v9ses(dir); - if (nd && nd->flags & LOOKUP_OPEN) - flags = nd->intent.open.flags - 1; - else { - /* - * create call without LOOKUP_OPEN is due - * to mknod of regular files. So use mknod - * operation. 
- */ - return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); - } - - name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%x\n", name, flags, omode); - - dfid = v9fs_fid_lookup(dentry->d_parent); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - return err; - } - - /* clone a fid to use for creation */ - ofid = p9_client_walk(dfid, 0, NULL, 1); - if (IS_ERR(ofid)) { - err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - return err; - } - - gid = v9fs_get_fsgid_for_create(dir); - - mode = omode; - /* Update mode based on ACL value */ - err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); - if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in creat %d\n", err); - goto error; - } - err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); - if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_open_dotl failed in creat %d\n", - err); - goto error; - } - /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE || - (nd && nd->flags & LOOKUP_OPEN)) { - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - d_set_d_op(dentry, &v9fs_cached_dentry_operations); - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - /* The fid would get clunked via a dput */ - fid = NULL; - } else { - /* - * Not in cached mode. No need to populate - * inode with stat. 
We need to get an inode - * so that we can set the acl with dentry - */ - inode = v9fs_get_inode(dir->i_sb, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - d_set_d_op(dentry, &v9fs_dentry_operations); - d_instantiate(dentry, inode); - } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); - - /* if we are opening a file, assign the open fid to the file */ - if (nd && nd->flags & LOOKUP_OPEN) { - filp = lookup_instantiate_filp(nd, dentry, generic_file_open); - if (IS_ERR(filp)) { - p9_client_clunk(ofid); - return PTR_ERR(filp); - } - filp->private_data = ofid; - } else - p9_client_clunk(ofid); - - return 0; - -error: - if (ofid) - p9_client_clunk(ofid); - if (fid) - p9_client_clunk(fid); - return err; -} - -/** * v9fs_vfs_create - VFS hook to create files * @dir: directory inode that is being created * @dentry: dentry that is being deleted @@ -884,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return err; } - -/** - * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory - * @dir: inode that is being unlinked - * @dentry: dentry that is being unlinked - * @mode: mode for new directory - * - */ - -static int v9fs_vfs_mkdir_dotl(struct inode *dir, - struct dentry *dentry, int omode) -{ - int err; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL, *dfid = NULL; - gid_t gid; - char *name; - mode_t mode; - struct inode *inode; - struct p9_qid qid; - struct dentry *dir_dentry; - struct posix_acl *dacl = NULL, *pacl = NULL; - - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); - err = 0; - v9ses = v9fs_inode2v9ses(dir); - - omode |= S_IFDIR; - if (dir->i_mode & S_ISGID) - omode |= S_ISGID; - - dir_dentry = v9fs_dentry_from_dir_inode(dir); - dfid = v9fs_fid_lookup(dir_dentry); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - dfid = NULL; - goto error; - } - - gid = v9fs_get_fsgid_for_create(dir); - mode = omode; - /* Update mode based on ACL value */ - err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); - if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mkdir %d\n", err); - goto error; - } - name = (char *) dentry->d_name.name; - err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); - if (err < 0) - goto error; - - /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - d_set_d_op(dentry, &v9fs_cached_dentry_operations); - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - fid = NULL; - } else { - /* - * Not in cached mode. No need to populate - * inode with stat. 
We need to get an inode - * so that we can set the acl with dentry - */ - inode = v9fs_get_inode(dir->i_sb, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - d_set_d_op(dentry, &v9fs_dentry_operations); - d_instantiate(dentry, inode); - } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); - -error: - if (fid) - p9_client_clunk(fid); - return err; -} - /** * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from @@ -993,7 +652,7 @@ error: * */ -static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, +struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nameidata) { struct super_block *sb; @@ -1040,11 +699,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto error_iput; inst_out: - if (v9ses->cache) - d_set_d_op(dentry, &v9fs_cached_dentry_operations); - else - d_set_d_op(dentry, &v9fs_dentry_operations); - d_add(dentry, inode); return NULL; @@ -1063,7 +717,7 @@ error: * */ -static int v9fs_vfs_unlink(struct inode *i, struct dentry *d) +int v9fs_vfs_unlink(struct inode *i, struct dentry *d) { return v9fs_remove(i, d, 0); } @@ -1075,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d) * */ -static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) +int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) { return v9fs_remove(i, d, 1); } @@ -1089,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) * */ -static int +int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -1196,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int -v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - int err; - struct v9fs_session_info *v9ses; - struct p9_fid *fid; - struct p9_stat_dotl *st; - - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); - err = -EPERM; - v9ses = v9fs_inode2v9ses(dentry->d_inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - return simple_getattr(mnt, dentry, stat); - - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - /* Ask for all the fields in stat structure. 
Server will return - * whatever it supports - */ - - st = p9_client_getattr_dotl(fid, P9_STATS_ALL); - if (IS_ERR(st)) - return PTR_ERR(st); - - v9fs_stat2inode_dotl(st, dentry->d_inode); - generic_fillattr(dentry->d_inode, stat); - /* Change block size to what the server returned */ - stat->blksize = st->st_blksize; - - kfree(st); - return 0; -} - /** * v9fs_vfs_setattr - set file metadata * @dentry: file whose metadata to set @@ -1291,64 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** - * v9fs_vfs_setattr_dotl - set file metadata - * @dentry: file whose metadata to set - * @iattr: metadata assignment structure - * - */ - -int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) -{ - int retval; - struct v9fs_session_info *v9ses; - struct p9_fid *fid; - struct p9_iattr_dotl p9attr; - - P9_DPRINTK(P9_DEBUG_VFS, "\n"); - - retval = inode_change_ok(dentry->d_inode, iattr); - if (retval) - return retval; - - p9attr.valid = iattr->ia_valid; - p9attr.mode = iattr->ia_mode; - p9attr.uid = iattr->ia_uid; - p9attr.gid = iattr->ia_gid; - p9attr.size = iattr->ia_size; - p9attr.atime_sec = iattr->ia_atime.tv_sec; - p9attr.atime_nsec = iattr->ia_atime.tv_nsec; - p9attr.mtime_sec = iattr->ia_mtime.tv_sec; - p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; - - retval = -EPERM; - v9ses = v9fs_inode2v9ses(dentry->d_inode); - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - retval = p9_client_setattr(fid, &p9attr); - if (retval < 0) - return retval; - - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(dentry->d_inode)) { - retval = vmtruncate(dentry->d_inode, iattr->ia_size); - if (retval) - return retval; - } - - setattr_copy(dentry->d_inode, iattr); - mark_inode_dirty(dentry->d_inode); - if (iattr->ia_valid & ATTR_MODE) { - /* We also want to update ACL when we update mode bits */ - retval = v9fs_acl_chmod(dentry); - if (retval < 0) - return retval; - } - return 0; -} - -/** * v9fs_stat2inode - populate an inode structure with mistat info * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate @@ -1426,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, } /** - * v9fs_stat2inode_dotl - populate an inode structure with stat info - * @stat: stat structure - * @inode: inode to populate - * @sb: superblock of filesystem - * - */ - -void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) -{ - - if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; - inode->i_uid = stat->st_uid; - inode->i_gid = stat->st_gid; - inode->i_nlink = stat->st_nlink; - inode->i_mode = stat->st_mode; - inode->i_rdev = new_decode_dev(stat->st_rdev); - - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, inode->i_rdev); - - i_size_write(inode, stat->st_size); - inode->i_blocks = stat->st_blocks; - } else { - if (stat->st_result_mask & P9_STATS_ATIME) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; - } - if (stat->st_result_mask & P9_STATS_MTIME) { - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; - } - if (stat->st_result_mask & P9_STATS_CTIME) { - 
inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; - } - if (stat->st_result_mask & P9_STATS_UID) - inode->i_uid = stat->st_uid; - if (stat->st_result_mask & P9_STATS_GID) - inode->i_gid = stat->st_gid; - if (stat->st_result_mask & P9_STATS_NLINK) - inode->i_nlink = stat->st_nlink; - if (stat->st_result_mask & P9_STATS_MODE) { - inode->i_mode = stat->st_mode; - if ((S_ISBLK(inode->i_mode)) || - (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - } - if (stat->st_result_mask & P9_STATS_RDEV) - inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); - if (stat->st_result_mask & P9_STATS_BLOCKS) - inode->i_blocks = stat->st_blocks; - } - if (stat->st_result_mask & P9_STATS_GEN) - inode->i_generation = stat->st_gen; - - /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION - * because the inode structure does not have fields for them. - */ -} - -/** * v9fs_qid2ino - convert qid into inode number * @qid: qid to hash * @@ -1602,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) * */ -static void +void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); @@ -1646,94 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, } /** - * v9fs_vfs_symlink_dotl - helper function to create symlinks - * @dir: directory inode containing symlink - * @dentry: dentry for symlink - * @symname: symlink data - * - * See Also: 9P2000.L RFC for more information - * - */ - -static int -v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct v9fs_session_info *v9ses; - struct p9_fid *dfid; - struct p9_fid *fid = NULL; - struct inode *inode; - struct p9_qid qid; - char *name; - int err; - gid_t gid; - - name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", - dir->i_ino, name, symname); - v9ses = v9fs_inode2v9ses(dir); - - dfid = v9fs_fid_lookup(dentry->d_parent); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - return err; - } - - gid = v9fs_get_fsgid_for_create(dir); - - /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ - err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); - - if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); - goto error; - } - - if (v9ses->cache) { - /* Now walk from the parent so we can get an unopened fid. */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - d_set_d_op(dentry, &v9fs_cached_dentry_operations); - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - fid = NULL; - } else { - /* Not in cached mode. 
No need to populate inode with stat */ - inode = v9fs_get_inode(dir->i_sb, S_IFLNK); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - d_set_d_op(dentry, &v9fs_dentry_operations); - d_instantiate(dentry, inode); - } - -error: - if (fid) - p9_client_clunk(fid); - - return err; -} - -/** * v9fs_vfs_symlink - helper function to create symlinks * @dir: directory inode containing symlink * @dentry: dentry for symlink @@ -1792,77 +1193,6 @@ clunk_fid: } /** - * v9fs_vfs_link_dotl - create a hardlink for dotl - * @old_dentry: dentry for file to link to - * @dir: inode destination for new link - * @dentry: dentry for link - * - */ - -static int -v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - int err; - struct p9_fid *dfid, *oldfid; - char *name; - struct v9fs_session_info *v9ses; - struct dentry *dir_dentry; - - P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", - dir->i_ino, old_dentry->d_name.name, - dentry->d_name.name); - - v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); - dfid = v9fs_fid_lookup(dir_dentry); - if (IS_ERR(dfid)) - return PTR_ERR(dfid); - - oldfid = v9fs_fid_lookup(old_dentry); - if (IS_ERR(oldfid)) - return PTR_ERR(oldfid); - - name = (char *) dentry->d_name.name; - - err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); - - if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); - return err; - } - - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - /* Get the latest stat info from server. */ - struct p9_fid *fid; - struct p9_stat_dotl *st; - - fid = v9fs_fid_lookup(old_dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); - if (IS_ERR(st)) - return PTR_ERR(st); - - v9fs_stat2inode_dotl(st, old_dentry->d_inode); - - kfree(st); - } else { - /* Caching disabled. No need to get upto date stat info. - * This dentry will be released immediately. 
So, just hold the - * inode - */ - ihold(old_dentry->d_inode); - } - - d_set_d_op(dentry, old_dentry->d_op); - d_instantiate(dentry, old_dentry->d_inode); - - return err; -} - -/** * v9fs_vfs_mknod - create a special file * @dir: inode destination for new link * @dentry: dentry for file @@ -1907,160 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } -/** - * v9fs_vfs_mknod_dotl - create a special file - * @dir: inode destination for new link - * @dentry: dentry for file - * @mode: mode for creation - * @rdev: device associated with special file - * - */ -static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, - dev_t rdev) -{ - int err; - char *name; - mode_t mode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL, *dfid = NULL; - struct inode *inode; - gid_t gid; - struct p9_qid qid; - struct dentry *dir_dentry; - struct posix_acl *dacl = NULL, *pacl = NULL; - - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); - - if (!new_valid_dev(rdev)) - return -EINVAL; - - v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); - dfid = v9fs_fid_lookup(dir_dentry); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - dfid = NULL; - goto error; - } - - gid = v9fs_get_fsgid_for_create(dir); - mode = omode; - /* Update mode based on ACL value */ - err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); - if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mknod %d\n", err); - goto error; - } - name = (char *) dentry->d_name.name; - - err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); - if (err < 0) - goto error; - - /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - d_set_d_op(dentry, &v9fs_cached_dentry_operations); - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - fid = NULL; - } else { - /* - * Not in cached mode. No need to populate inode with stat. 
- * socket syscall returns a fd, so we need instantiate - */ - inode = v9fs_get_inode(dir->i_sb, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - d_set_d_op(dentry, &v9fs_dentry_operations); - d_instantiate(dentry, inode); - } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); -error: - if (fid) - p9_client_clunk(fid); - return err; -} - -static int -v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen) -{ - int retval; - struct p9_fid *fid; - char *target = NULL; - - P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); - retval = -EPERM; - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - retval = p9_client_readlink(fid, &target); - if (retval < 0) - return retval; - - strncpy(buffer, target, buflen); - P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer); - - retval = strnlen(buffer, buflen); - return retval; -} - -/** - * v9fs_vfs_follow_link_dotl - follow a symlink path - * @dentry: dentry for symlink - * @nd: nameidata - * - */ - -static void * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) -{ - int len = 0; - char *link = __getname(); - - P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); - - if (!link) - link = ERR_PTR(-ENOMEM); - else { - len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX); - if (len < 0) { - __putname(link); - link = ERR_PTR(len); - } else - link[min(len, PATH_MAX-1)] = 0; - } - nd_set_link(nd, link); - - return NULL; -} - static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -2075,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { .setattr = v9fs_vfs_setattr, }; -static const struct inode_operations v9fs_dir_inode_operations_dotl = { - .create = v9fs_vfs_create_dotl, - .lookup = v9fs_vfs_lookup, - .link = v9fs_vfs_link_dotl, - .symlink = v9fs_vfs_symlink_dotl, - .unlink = v9fs_vfs_unlink, - .mkdir = v9fs_vfs_mkdir_dotl, - .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod_dotl, - .rename = v9fs_vfs_rename, - .getattr = v9fs_vfs_getattr_dotl, - .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, - .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, -}; - static const struct inode_operations v9fs_dir_inode_operations = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -2111,16 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = { .setattr = v9fs_vfs_setattr, }; -static const struct inode_operations v9fs_file_inode_operations_dotl = { - .getattr = v9fs_vfs_getattr_dotl, - .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, - .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, -}; - static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, @@ -2129,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = { .setattr = v9fs_vfs_setattr, }; -static const struct inode_operations v9fs_symlink_inode_operations_dotl = { - .readlink = v9fs_vfs_readlink_dotl, - .follow_link = v9fs_vfs_follow_link_dotl, - .put_link = v9fs_vfs_put_link, - .getattr = v9fs_vfs_getattr_dotl, - .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = 
generic_removexattr, - .listxattr = v9fs_listxattr, -}; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c new file mode 100644 index 00000000000..fe3ffa9aace --- /dev/null +++ b/fs/9p/vfs_inode_dotl.c @@ -0,0 +1,824 @@ +/* + * linux/fs/9p/vfs_inode_dotl.c + * + * This file contains vfs inode ops for the 9P2000.L protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/pagemap.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/inet.h> +#include <linux/namei.h> +#include <linux/idr.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev); + +/** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +/** + * v9fs_dentry_from_dir_inode - helper function to get the dentry from + * dir inode. + * + */ + +static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + /* Directory should have only one entry. */ + BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); + dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); + spin_unlock(&inode->i_lock); + return dentry; +} + +struct inode * +v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + struct inode *ret = NULL; + int err; + struct p9_stat_dotl *st; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return ERR_CAST(st); + + ret = v9fs_get_inode(sb, st->st_mode); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + goto error; + } + + v9fs_stat2inode_dotl(st, ret); + ret->i_ino = v9fs_qid2ino(&st->qid); +#ifdef CONFIG_9P_FSCACHE + v9fs_vcookie_set_qid(ret, &st->qid); + v9fs_cache_inode_get_cookie(ret); +#endif + err = v9fs_get_acl(ret, fid); + if (err) { + iput(ret); + goto error; + } + kfree(st); + return ret; +error: + kfree(st); + return ERR_PTR(err); +} + +/** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 
+ * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, + struct nameidata *nd) +{ + int err = 0; + char *name = NULL; + gid_t gid; + int flags; + mode_t mode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL; + struct p9_fid *dfid, *ofid; + struct file *filp; + struct p9_qid qid; + struct inode *inode; + struct posix_acl *pacl = NULL, *dacl = NULL; + + v9ses = v9fs_inode2v9ses(dir); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else { + /* + * create call without LOOKUP_OPEN is due + * to mknod of regular files. So use mknod + * operation. + */ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + } + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " + "mode:0x%x\n", name, flags, omode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in creat %d\n", err); + goto error; + } + err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + + /* instantiate inode and assign the unopened fid to the dentry */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); + + /* Since we are opening a file, assign the open fid to the file */ + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); + if (IS_ERR(filp)) { + p9_client_clunk(ofid); + return PTR_ERR(filp); + } + filp->private_data = ofid; + return 0; + +error: + if (ofid) + p9_client_clunk(ofid); + if (fid) + p9_client_clunk(fid); + return err; +} + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, + struct dentry *dentry, int omode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + gid_t gid; + char *name; + mode_t mode; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + omode |= S_IFDIR; + if (dir->i_mode & S_ISGID) + omode |= S_ISGID; + + dir_dentry = v9fs_dentry_from_dir_inode(dir); + 
dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mkdir %d\n", err); + goto error; + } + name = (char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate + * inode with stat. We need to get an inode + * so that we can set the acl with dentry + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); + +error: + if (fid) + p9_client_clunk(fid); + return err; +} + +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + return simple_getattr(mnt, dentry, stat); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. 
Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, dentry->d_inode); + generic_fillattr(dentry->d_inode, stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + +/** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + p9attr.valid = iattr->ia_valid; + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + retval = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + retval = p9_client_setattr(fid, &p9attr); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(dentry->d_inode)) { + retval = vmtruncate(dentry->d_inode, iattr->ia_size); + if (retval) + return retval; + } + + setattr_copy(dentry->d_inode, iattr); + mark_inode_dirty(dentry->d_inode); + if (iattr->ia_valid & ATTR_MODE) { + /* We also want to update ACL when we update mode bits */ + retval = v9fs_acl_chmod(dentry); + if (retval < 0) + return retval; + } + return 0; +} + +/** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + inode->i_nlink = stat->st_nlink; + inode->i_mode = stat->st_mode; + inode->i_rdev = new_decode_dev(stat->st_rdev); + + if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + inode->i_nlink = stat->st_nlink; + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if 
((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. + */ +} + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct inode *inode; + struct p9_qid qid; + char *name; + int err; + gid_t gid; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", + dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + if (v9ses->cache) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* Not in cached mode. 
No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + struct p9_fid *dfid, *oldfid; + char *name; + struct v9fs_session_info *v9ses; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, + dentry->d_name.name); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = (char *) dentry->d_name.name; + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + struct p9_stat_dotl *st; + + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, old_dentry->d_inode); + + kfree(st); + } else { + /* Caching disabled. No need to get upto date stat info. + * This dentry will be released immediately. 
So, just hold the + * inode + */ + ihold(old_dentry->d_inode); + } + d_instantiate(dentry, old_dentry->d_inode); + + return err; +} + +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev) +{ + int err; + char *name; + mode_t mode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + gid_t gid; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mknod %d\n", err); + goto error; + } + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate inode with stat. 
+ * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); +error: + if (fid) + p9_client_clunk(fid); + return err; +} + +/** + * v9fs_vfs_follow_link_dotl - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +{ + int retval; + struct p9_fid *fid; + char *link = __getname(); + char *target; + + P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + + if (!link) { + link = ERR_PTR(-ENOMEM); + goto ndset; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + __putname(link); + link = ERR_PTR(PTR_ERR(fid)); + goto ndset; + } + retval = p9_client_readlink(fid, &target); + if (!retval) { + strcpy(link, target); + kfree(target); + goto ndset; + } + __putname(link); + link = ERR_PTR(retval); +ndset: + nd_set_link(nd, link); + return NULL; +} + +const struct inode_operations v9fs_dir_inode_operations_dotl = { + .create = v9fs_vfs_create_dotl, + .lookup = v9fs_vfs_lookup, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir_dotl, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod_dotl, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_file_inode_operations_dotl = { + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link_dotl, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, +}; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index c55c614500a..dbaabe3b813 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -141,6 +141,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } v9fs_fill_super(sb, v9ses, flags, data); + if (v9ses->cache) + sb->s_d_op = &v9fs_cached_dentry_operations; + else + sb->s_d_op = &v9fs_dentry_operations; + inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { retval = PTR_ERR(inode); @@ -217,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); - if (s->s_root) - v9fs_dentry_release(s->s_root); /* clunk root */ - kill_anon_super(s); v9fs_session_cancel(v9ses); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 43ec7df8433..d288773871b 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, "p9_client_xattrcreate failed %d\n", retval); goto error; } - msize = fid->clnt->msize;; + msize = fid->clnt->msize; while (value_len) { if (value_len > (msize - P9_IOHDRSZ)) write_count = msize - P9_IOHDRSZ; diff --git a/fs/Kconfig 
b/fs/Kconfig index 771f457402d..3db9caa57ed 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -30,15 +30,6 @@ config FS_MBCACHE source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" -config FS_POSIX_ACL -# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4) -# -# NOTE: you can implement Posix ACLs without these helpers (XFS does). -# Never use this symbol for ifdefs. -# - bool - default n - source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" @@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +# Posix ACL utility routines +# +# Note: Posix ACLs can be implemented without these helpers. Never use +# this symbol for ifdefs in core code. +# +config FS_POSIX_ACL + def_bool n + config EXPORTFS tristate config FILE_LOCKING - bool "Enable POSIX file locking API" if EMBEDDED + bool "Enable POSIX file locking API" if EXPERT default y help This option enables standard file locking support, required diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index bf7693c384f..3b4a764ed78 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -276,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) struct object_info obj; int error; - d_set_d_op(dentry, &adfs_dentry_operations); lock_kernel(); error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); if (error == 0) { diff --git a/fs/adfs/super.c b/fs/adfs/super.c index a4041b52fbc..2d7954049fb 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -473,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) asb->s_namelen = ADFS_F_NAME_LEN; } + sb->s_d_op = &adfs_dentry_operations; root = adfs_iget(sb, &root_obj); sb->s_root = d_alloc_root(root); if (!sb->s_root) { @@ -483,8 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) kfree(asb->s_map); adfs_error(sb, "get root inode failed\n"); goto error; - } else - d_set_d_op(sb->s_root, &adfs_dentry_operations); + } unlock_kernel(); return 0; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index a8cbdeb3402..0e95f73a702 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops; extern const struct address_space_operations affs_aops_ofs; extern const struct dentry_operations affs_dentry_operations; +extern const struct dentry_operations affs_intl_dentry_operations; static inline void affs_set_blocksize(struct super_block *sb, int size) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 944a4042fb6..e3e9efc1fdd 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -32,7 +32,7 @@ const struct dentry_operations affs_dentry_operations = { .d_compare = affs_compare_dentry, }; -static const struct dentry_operations affs_intl_dentry_operations = { +const struct dentry_operations affs_intl_dentry_operations = { .d_hash = affs_intl_hash_dentry, .d_compare = affs_intl_compare_dentry, }; @@ -240,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (IS_ERR(inode)) return ERR_CAST(inode); } - d_set_d_op(dentry, AFFS_SB(sb)->s_flags & SF_INTL ? 
&affs_intl_dentry_operations : &affs_dentry_operations); d_add(dentry, inode); return NULL; } diff --git a/fs/affs/super.c b/fs/affs/super.c index d39081bbe7c..b31507d0f9b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -477,12 +477,16 @@ got_root: goto out_error_noinode; } + if (AFFS_SB(sb)->s_flags & SF_INTL) + sb->s_d_op = &affs_intl_dentry_operations; + else + sb->s_d_op = &affs_dentry_operations; + sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); goto out_error; } - d_set_d_op(sb->s_root, &affs_dentry_operations); pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); return 0; diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index a3bcec75c54..1c8c6cc6de3 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, call->server = server; INIT_WORK(&call->work, SRXAFSCB_CallBack); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, call->server = server; INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, call->server = server; INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_Probe); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 34a3263d60a..20c106f2492 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -62,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = { .setattr = afs_setattr, }; -static const struct dentry_operations afs_fs_dentry_operations = { +const struct dentry_operations afs_fs_dentry_operations = { .d_revalidate = afs_d_revalidate, .d_delete = afs_d_delete, .d_release = afs_d_release, + .d_automount = afs_d_automount, }; #define AFS_DIR_HASHTBL_SIZE 128 @@ -582,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, } success: - d_set_d_op(dentry, &afs_fs_dentry_operations); - d_add(dentry, inode); _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", fid.vnode, diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0747339011c..db66c520147 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_generation = 0; set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - inode->i_flags |= S_NOATIME; + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; unlock_new_inode(inode); _leave(" = %p", inode); return inode; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 
6d4bc1c8ff6..5a9b6843bac 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *); * dir.c */ extern const struct inode_operations afs_dir_inode_operations; +extern const struct dentry_operations afs_fs_dentry_operations; extern const struct file_operations afs_dir_file_operations; /* @@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *); /* * main.c */ +extern struct workqueue_struct *afs_wq; extern struct afs_uuid afs_uuid; /* @@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations; extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; +extern struct vfsmount *afs_d_automount(struct path *); extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); extern void afs_mntpt_kill_timer(void); diff --git a/fs/afs/main.c b/fs/afs/main.c index cfd1cbe25b2..42dd2e499ed 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -30,6 +30,7 @@ module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct afs_uuid afs_uuid; +struct workqueue_struct *afs_wq; /* * get a client UUID @@ -87,10 +88,16 @@ static int __init afs_init(void) if (ret < 0) return ret; + /* create workqueue */ + ret = -ENOMEM; + afs_wq = alloc_workqueue("afs", 0, 0); + if (!afs_wq) + return ret; + /* register the /proc stuff */ ret = afs_proc_init(); if (ret < 0) - return ret; + goto error_proc; #ifdef CONFIG_AFS_FSCACHE /* we want to be able to cache */ @@ -140,6 +147,8 @@ error_cell_init: error_cache: #endif afs_proc_cleanup(); +error_proc: + destroy_workqueue(afs_wq); rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); return ret; @@ -163,7 +172,7 @@ static void __exit afs_exit(void) afs_purge_servers(); afs_callback_update_kill(); afs_vlocation_purge(); - flush_scheduled_work(); + destroy_workqueue(afs_wq); afs_cell_purge(); #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 6153417caf5..aa59184151d 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); static int afs_mntpt_open(struct inode *inode, struct file *file); -static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd); static void afs_mntpt_expiry_timed_out(struct work_struct *work); const struct file_operations afs_mntpt_file_operations = { @@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = { const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, - .follow_link = afs_mntpt_follow_link, .readlink = page_readlink, .getattr = afs_getattr, }; const struct inode_operations afs_autocell_inode_operations = { - .follow_link = afs_mntpt_follow_link, .getattr = afs_getattr, }; @@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) _debug("symlink is a mountpoint"); spin_lock(&vnode->lock); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + vnode->vfs_inode.i_flags |= S_AUTOMOUNT; spin_unlock(&vnode->lock); } @@ -238,52 +236,24 @@ error_no_devname: } /* - * follow a link from a mountpoint directory, thus causing it to be mounted + * handle an automount point */ -static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - int 
err; - _enter("%p{%s},{%s:%p{%s},}", - dentry, - dentry->d_name.name, - nd->path.mnt->mnt_devname, - dentry, - nd->path.dentry->d_name.name); - - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); + _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); - newmnt = afs_mntpt_do_automount(nd->path.dentry); - if (IS_ERR(newmnt)) { - path_put(&nd->path); - return (void *)newmnt; - } - - mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts); - switch (err) { - case 0: - path_put(&nd->path); - nd->path.mnt = newmnt; - nd->path.dentry = dget(newmnt->mnt_root); - schedule_delayed_work(&afs_mntpt_expiry_timer, - afs_mntpt_expiry_timeout * HZ); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - default: - mntput(newmnt); - break; - } + newmnt = afs_mntpt_do_automount(path->dentry); + if (IS_ERR(newmnt)) + return newmnt; - _leave(" = %d", err); - return ERR_PTR(err); + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + return newmnt; } /* @@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work) if (!list_empty(&afs_vfsmounts)) { mark_mounts_for_expiry(&afs_vfsmounts); - schedule_delayed_work(&afs_mntpt_expiry_timer, - afs_mntpt_expiry_timeout * HZ); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); } _leave(""); @@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void) _enter(""); ASSERT(list_empty(&afs_vfsmounts)); - cancel_delayed_work(&afs_mntpt_expiry_timer); - flush_scheduled_work(); + cancel_delayed_work_sync(&afs_mntpt_expiry_timer); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 654d8fdbf01..e45a323aebb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, if (!call) { /* its an incoming call for our callback service */ skb_queue_tail(&afs_incoming_calls, skb); - schedule_work(&afs_collect_incoming_call_work); + queue_work(afs_wq, &afs_collect_incoming_call_work); } else { /* route the messages directly to the appropriate call */ skb_queue_tail(&call->rx_queue, skb); diff --git a/fs/afs/server.c b/fs/afs/server.c index 9fdc7fe3a7b..d59b7516e94 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server) if (atomic_read(&server->usage) == 0) { list_move_tail(&server->grave, &afs_server_graveyard); server->time_of_death = get_seconds(); - schedule_delayed_work(&afs_server_reaper, - afs_server_timeout * HZ); + queue_delayed_work(afs_wq, &afs_server_reaper, + afs_server_timeout * HZ); } spin_unlock(&afs_server_graveyard_lock); _leave(" [dead]"); @@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work) expiry = server->time_of_death + afs_server_timeout; if (expiry > now) { delay = (expiry - now) * HZ; - if (!schedule_delayed_work(&afs_server_reaper, delay)) { + if (!queue_delayed_work(afs_wq, &afs_server_reaper, + delay)) { cancel_delayed_work(&afs_server_reaper); - schedule_delayed_work(&afs_server_reaper, - delay); + queue_delayed_work(afs_wq, &afs_server_reaper, + delay); } break; } @@ -323,5 +324,5 @@ void __exit afs_purge_servers(void) { afs_server_timeout = 0; cancel_delayed_work(&afs_server_reaper); - 
schedule_delayed_work(&afs_server_reaper, 0); + queue_delayed_work(afs_wq, &afs_server_reaper, 0); } diff --git a/fs/afs/super.c b/fs/afs/super.c index f901a9d7c11..fb240e8766d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -336,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data) if (!root) goto error; + sb->s_d_op = &afs_fs_dentry_operations; sb->s_root = root; _leave(" = 0"); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 9ac260d1361..431984d2e37 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl) _debug("buried"); list_move_tail(&vl->grave, &afs_vlocation_graveyard); vl->time_of_death = get_seconds(); - schedule_delayed_work(&afs_vlocation_reap, - afs_vlocation_timeout * HZ); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + afs_vlocation_timeout * HZ); /* suspend updates on this record */ if (!list_empty(&vl->update)) { @@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work) if (expiry > now) { delay = (expiry - now) * HZ; _debug("delay %lu", delay); - if (!schedule_delayed_work(&afs_vlocation_reap, - delay)) { + if (!queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay)) { cancel_delayed_work(&afs_vlocation_reap); - schedule_delayed_work(&afs_vlocation_reap, - delay); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay); } break; } @@ -620,7 +620,7 @@ void afs_vlocation_purge(void) destroy_workqueue(afs_vlocation_update_worker); cancel_delayed_work(&afs_vlocation_reap); - schedule_delayed_work(&afs_vlocation_reap, 0); + queue_delayed_work(afs_wq, &afs_vlocation_reap, 0); } /* @@ -87,7 +87,7 @@ static int __init aio_setup(void) aio_wq = create_workqueue("aio"); abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); - BUG_ON(!abe_pool); + BUG_ON(!aio_wq || !abe_pool); pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); @@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx) queue_delayed_work(aio_wq, &ctx->wq, timeout); } - -/* - * aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static inline void aio_run_iocbs(struct kioctx *ctx) -{ - int requeue; - - spin_lock_irq(&ctx->ctx_lock); - - requeue = __aio_run_iocbs(ctx); - spin_unlock_irq(&ctx->ctx_lock); - if (requeue) - aio_queue_work(ctx); -} - /* - * just like aio_run_iocbs, but keeps running them until - * the list stays empty + * aio_run_all_iocbs: + * Process all pending retries queued on the ioctx + * run list, and keep running them until the list + * stays empty. + * Assumes it is operating within the aio issuer's mm context. 
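+ * (In practice this means __aio_run_iocbs() is simply called again + * for as long as it reports that iocbs were requeued.)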
*/ static inline void aio_run_all_iocbs(struct kioctx *ctx) { @@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long ret = -EINVAL; if (likely(ioctx)) { - if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) + if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, timeout); put_ioctx(ioctx); } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 5fd38112a6c..c5567cb7843 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly; static struct inode *anon_inode_inode; static const struct file_operations anon_inode_fops; -static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC); -} - /* * anon_inodefs_dname() is called from d_path(). */ @@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) dentry->d_name.name); } +static const struct dentry_operations anon_inodefs_dentry_operations = { + .d_dname = anon_inodefs_dname, +}; + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); +} + static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .mount = anon_inodefs_mount, .kill_sb = kill_anon_super, }; -static const struct dentry_operations anon_inodefs_dentry_operations = { - .d_dname = anon_inodefs_dname, -}; /* * nop .set_page_dirty method so that people can use .page_mkwrite on @@ -64,9 +66,9 @@ static const struct address_space_operations anon_aops = { }; /** - * anon_inode_getfd - creates a new file instance by hooking it up to an - * anonymous inode, and a dentry that describe the "class" - * of the file + * anon_inode_getfile - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describes the "class" + * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name, */ ihold(anon_inode_inode); - d_set_d_op(path.dentry, &anon_inodefs_dentry_operations); d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; @@ -232,7 +233,7 @@ static int __init anon_inode_init(void) return 0; err_mntput: - mntput_long(anon_inode_mnt); + mntput(anon_inode_mnt); err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); err_exit: diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 0fffe1c24ce..54f92379272 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -88,18 +88,9 @@ struct autofs_info { uid_t uid; gid_t gid; - - mode_t mode; - size_t size; - - void (*free)(struct autofs_info *); - union { - const char *symlink; - } u; }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ -#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ struct autofs_wait_queue { @@ -176,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry) return 0; } -static inline void autofs4_copy_atime(struct file *src, struct file *dst) -{ - dst->f_path.dentry->d_inode->i_atime = - src->f_path.dentry->d_inode->i_atime; - return; -} - -struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *); 
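+/* allocate a new inode for this super block, initialised for the given mode */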
+struct inode *autofs4_get_inode(struct super_block *, mode_t); void autofs4_free_ino(struct autofs_info *); /* Expiration */ @@ -212,16 +196,89 @@ void autofs_dev_ioctl_exit(void); extern const struct inode_operations autofs4_symlink_inode_operations; extern const struct inode_operations autofs4_dir_inode_operations; -extern const struct inode_operations autofs4_root_inode_operations; -extern const struct inode_operations autofs4_indirect_root_inode_operations; -extern const struct inode_operations autofs4_direct_root_inode_operations; extern const struct file_operations autofs4_dir_operations; extern const struct file_operations autofs4_root_operations; +extern const struct dentry_operations autofs4_dentry_operations; + +/* VFS automount flags management functions */ + +static inline void __managed_dentry_set_automount(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_set_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_automount(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_clear_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_transit(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_set_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_transit(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_clear_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_managed(struct dentry *dentry) +{ + dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_set_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_managed(struct dentry *dentry) +{ + dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_clear_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_managed(dentry); + spin_unlock(&dentry->d_lock); +} /* Initializing function */ int autofs4_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode); +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); +void autofs4_clean_ino(struct autofs_info *); /* Queue management functions */ @@ -229,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct path *path) -{ - int res = 0; - - while (d_mountpoint(path->dentry)) { - int followed = follow_down(path); - if (!followed) - break; - res = 1; - } - return res; -} - static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) { return 
new_encode_dev(sbi->sb->s_dev); @@ -294,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) return; } -void autofs4_dentry_release(struct dentry *); extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index eff9a419469..1442da4860e 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); - if (follow_down(&path)) + if (follow_down_one(&path)) magic = path.mnt->mnt_sb->s_magic; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index cc1d0136590..f43100b9662 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry, if (ino == NULL) return 0; - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - return 0; - if (!do_now) { /* Too young to die */ if (!timeout || time_after(ino->last_used + timeout, now)) @@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) path_get(&path); - if (!follow_down(&path)) + if (!follow_down_one(&path)) goto done; if (is_autofs4_dentry(path.dentry)) { @@ -100,7 +96,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *p, *ret; if (prev == NULL) - return dget(prev); + return dget(root); spin_lock(&autofs4_lock); relock: @@ -137,7 +133,7 @@ again: spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); /* Negative dentry - try next */ if (!simple_positive(ret)) { - spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); p = ret; goto again; } @@ -283,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, unsigned long timeout; struct dentry *root = dget(sb->s_root); int do_now = how & AUTOFS_EXP_IMMEDIATE; + struct autofs_info *ino; if (!root) return NULL; @@ -291,19 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, timeout = sbi->exp_timeout; spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + return NULL; + } + managed_dentry_set_transit(root); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); - if (d_mountpoint(root)) { - ino->flags |= AUTOFS_INF_MOUNTPOINT; - spin_lock(&root->d_lock); - root->d_flags &= ~DCACHE_MOUNTED; - spin_unlock(&root->d_lock); - } ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } + managed_dentry_clear_transit(root); spin_unlock(&sbi->fs_lock); dput(root); @@ -340,6 +339,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, while ((dentry = get_next_positive_dentry(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto cont; + managed_dentry_set_transit(dentry); /* * Case 1: (i) indirect mount or top level pseudo direct mount @@ -399,6 +402,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: + managed_dentry_clear_transit(dentry); +cont: spin_unlock(&sbi->fs_lock); } return NULL; @@ -479,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; + if (!d_unhashed(dentry)) + managed_dentry_clear_transit(dentry); 
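+ /* wake up anyone waiting on this expire in autofs4_expire_wait() */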
complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -504,18 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_MOUNTPOINT) { - spin_lock(&sb->s_root->d_lock); - /* - * If we haven't been expired away, then reset - * mounted status. - */ - if (mnt->mnt_parent != mnt) - sb->s_root->d_flags |= DCACHE_MOUNTED; - spin_unlock(&sb->s_root->d_lock); - ino->flags &= ~AUTOFS_INF_MOUNTPOINT; - } ino->flags &= ~AUTOFS_INF_EXPIRING; + spin_lock(&dentry->d_lock); + if (ret) + __managed_dentry_clear_transit(dentry); + else { + if ((IS_ROOT(dentry) || + (autofs_type_indirect(sbi->type) && + IS_ROOT(dentry->d_parent))) && + !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_automount(dentry); + } + spin_unlock(&dentry->d_lock); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index a7bdb9dcac8..180fa2425e4 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -22,77 +22,27 @@ #include "autofs_i.h" #include <linux/module.h> -static void ino_lnkfree(struct autofs_info *ino) +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) { - if (ino->u.symlink) { - kfree(ino->u.symlink); - ino->u.symlink = NULL; - } -} - -struct autofs_info *autofs4_init_ino(struct autofs_info *ino, - struct autofs_sb_info *sbi, mode_t mode) -{ - int reinit = 1; - - if (ino == NULL) { - reinit = 0; - ino = kmalloc(sizeof(*ino), GFP_KERNEL); - } - - if (ino == NULL) - return NULL; - - if (!reinit) { - ino->flags = 0; - ino->inode = NULL; - ino->dentry = NULL; - ino->size = 0; + struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + if (ino) { INIT_LIST_HEAD(&ino->active); - ino->active_count = 0; INIT_LIST_HEAD(&ino->expiring); - atomic_set(&ino->count, 0); + ino->last_used = jiffies; + ino->sbi = sbi; } + return ino; +} +void autofs4_clean_ino(struct autofs_info *ino) +{ ino->uid = 0; ino->gid = 0; - ino->mode = mode; ino->last_used = jiffies; - - ino->sbi = sbi; - - if (reinit && ino->free) - (ino->free)(ino); - - memset(&ino->u, 0, sizeof(ino->u)); - - ino->free = NULL; - - if (S_ISLNK(mode)) - ino->free = ino_lnkfree; - - return ino; } void autofs4_free_ino(struct autofs_info *ino) { - struct autofs_info *p_ino; - - if (ino->dentry) { - ino->dentry->d_fsdata = NULL; - if (ino->dentry->d_inode) { - struct dentry *parent = ino->dentry->d_parent; - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(parent); - if (p_ino && parent != ino->dentry) - atomic_dec(&p_ino->count); - } - dput(ino->dentry); - } - ino->dentry = NULL; - } - if (ino->free) - (ino->free)(ino); kfree(ino); } @@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } +static void autofs4_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + static const struct super_operations autofs4_sops = { .statfs = simple_statfs, .show_options = autofs4_show_options, + .evict_inode = autofs4_evict_inode, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, @@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, return (*pipefd < 0); } -static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi) -{ - struct autofs_info *ino; - - ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755); - if (!ino) - return NULL; - - return ino; -} - 
-static const struct dentry_operations autofs4_sb_dentry_operations = { - .d_release = autofs4_dentry_release, -}; - int autofs4_fill_super(struct super_block *s, void *data, int silent) { struct inode * root_inode; @@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; s->s_op = &autofs4_sops; + s->s_d_op = &autofs4_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. */ - ino = autofs4_mkroot(sbi); + ino = autofs4_new_ino(sbi); if (!ino) goto fail_free; - root_inode = autofs4_get_inode(s, ino); + root_inode = autofs4_get_inode(s, S_IFDIR | 0755); if (!root_inode) goto fail_ino; @@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_iput; pipe = NULL; - d_set_d_op(root, &autofs4_sb_dentry_operations); root->d_fsdata = ino; /* Can this call block? */ @@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_dput; } + if (autofs_type_trigger(sbi->type)) + __managed_dentry_set_managed(root); + root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = autofs_type_trigger(sbi->type) ? - &autofs4_direct_root_inode_operations : - &autofs4_indirect_root_inode_operations; + root_inode->i_op = &autofs4_dir_inode_operations; /* Couldn't this be tested earlier? */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || @@ -383,16 +326,14 @@ fail_unlock: return -EINVAL; } -struct inode *autofs4_get_inode(struct super_block *sb, - struct autofs_info *inf) +struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) { struct inode *inode = new_inode(sb); if (inode == NULL) return NULL; - inf->inode = inode; - inode->i_mode = inf->mode; + inode->i_mode = mode; if (sb->s_root) { inode->i_uid = sb->s_root->d_inode->i_uid; inode->i_gid = sb->s_root->d_inode->i_gid; @@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_ino = get_next_ino(); - if (S_ISDIR(inf->mode)) { + if (S_ISDIR(mode)) { inode->i_nlink = 2; inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; - } else if (S_ISLNK(inf->mode)) { - inode->i_size = inf->size; + } else if (S_ISLNK(mode)) { inode->i_op = &autofs4_symlink_inode_operations; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 651e4ef563b..014e7aba3b0 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -35,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); -static void *autofs4_follow_link(struct dentry *, struct nameidata *); - -#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) -#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE) +static struct vfsmount *autofs4_d_automount(struct path *); +static int autofs4_d_manage(struct dentry *, bool, bool); +static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -60,7 +59,7 @@ const struct file_operations autofs4_dir_operations = { .llseek = dcache_dir_lseek, }; -const struct inode_operations autofs4_indirect_root_inode_operations = { +const struct inode_operations autofs4_dir_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, 
.symlink = autofs4_dir_symlink, @@ -68,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = { .rmdir = autofs4_dir_rmdir, }; -const struct inode_operations autofs4_direct_root_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, - .follow_link = autofs4_follow_link, -}; - -const struct inode_operations autofs4_dir_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .symlink = autofs4_dir_symlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, +const struct dentry_operations autofs4_dentry_operations = { + .d_automount = autofs4_d_automount, + .d_manage = autofs4_d_manage, + .d_release = autofs4_dentry_release, }; static void autofs4_add_active(struct dentry *dentry) @@ -116,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry) return; } -static unsigned int autofs4_need_mount(unsigned int flags) -{ - unsigned int res = 0; - if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS)) - res = 1; - return res; -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; @@ -158,278 +139,27 @@ out: return dcache_dir_open(inode, file); } -static int try_to_fill_dentry(struct dentry *dentry, int flags) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - int status; - - DPRINTK("dentry=%p %.*s ino=%p", - dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - - /* - * Wait for a pending mount, triggering one if there - * isn't one already - */ - if (dentry->d_inode == NULL) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_MOUNT); - - DPRINTK("mount done status=%d", status); - - /* Turn this into a real negative dentry? 
*/ - if (status == -ENOENT) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - return status; - } else if (status) { - /* Return a negative dentry, but leave it "pending" */ - return status; - } - /* Trigger mount for path component or follow link */ - } else if (ino->flags & AUTOFS_INF_PENDING || - autofs4_need_mount(flags)) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - spin_lock(&sbi->fs_lock); - ino->flags |= AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); - - DPRINTK("mount done status=%d", status); - - if (status) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - return status; - } - } - - /* Initialize expiry counter after successful mount */ - ino->last_used = jiffies; - - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - - return 0; -} - -/* For autofs direct mounts the follow link triggers the mount */ -static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - int oz_mode = autofs4_oz_mode(sbi); - unsigned int lookup_type; - int status; - - DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", - dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, - nd->flags); - /* - * For an expire of a covered direct or offset mount we need - * to break out of follow_down() at the autofs mount trigger - * (d_mounted--), so we can see the expiring flag, and manage - * the blocking and following here until the expire is completed. - */ - if (oz_mode) { - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { - spin_unlock(&sbi->fs_lock); - /* Follow down to our covering mount. */ - if (!follow_down(&nd->path)) - goto done; - goto follow; - } - spin_unlock(&sbi->fs_lock); - goto done; - } - - /* If an expire request is pending everyone must wait. */ - autofs4_expire_wait(dentry); - - /* We trigger a mount for almost all flags */ - lookup_type = autofs4_need_mount(nd->flags); - spin_lock(&sbi->fs_lock); - spin_lock(&autofs4_lock); - spin_lock(&dentry->d_lock); - if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) { - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - spin_unlock(&sbi->fs_lock); - goto follow; - } - - /* - * If the dentry contains directories then it is an autofs - * multi-mount with no root mount offset. So don't try to - * mount it again. - */ - if (ino->flags & AUTOFS_INF_PENDING || - (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - spin_unlock(&sbi->fs_lock); - - status = try_to_fill_dentry(dentry, nd->flags); - if (status) - goto out_error; - - goto follow; - } - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - spin_unlock(&sbi->fs_lock); -follow: - /* - * If there is no root mount it must be an autofs - * multi-mount with no root offset so we don't need - * to follow it. - */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path)) { - status = -ENOENT; - goto out_error; - } - } - -done: - return NULL; - -out_error: - path_put(&nd->path); - return ERR_PTR(status); -} - -/* - * Revalidate is called on every cache lookup. Some of those - * cache lookups may actually happen while the dentry is not - * yet completely filled in, and revalidate has to delay such - * lookups.. 
- */ -static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) +static void autofs4_dentry_release(struct dentry *de) { - struct inode *dir; - struct autofs_sb_info *sbi; - int oz_mode; - int flags = nd ? nd->flags : 0; - int status = 1; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - dir = dentry->d_parent->d_inode; - sbi = autofs4_sbi(dir->i_sb); - oz_mode = autofs4_oz_mode(sbi); - - /* Pending dentry */ - spin_lock(&sbi->fs_lock); - if (autofs4_ispending(dentry)) { - /* The daemon never causes a mount to trigger */ - spin_unlock(&sbi->fs_lock); - - if (oz_mode) - return 1; - - /* - * If the directory has gone away due to an expire - * we have been called as ->d_revalidate() and so - * we need to return false and proceed to ->lookup(). - */ - if (autofs4_expire_wait(dentry) == -EAGAIN) - return 0; - - /* - * A zero status is success otherwise we have a - * negative error code. - */ - status = try_to_fill_dentry(dentry, flags); - if (status == 0) - return 1; - - return status; - } - spin_unlock(&sbi->fs_lock); - - /* Negative dentry.. invalidate if "old" */ - if (dentry->d_inode == NULL) - return 0; - - /* Check for a non-mountpoint directory with no contents */ - spin_lock(&autofs4_lock); - spin_lock(&dentry->d_lock); - if (S_ISDIR(dentry->d_inode->i_mode) && - !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - DPRINTK("dentry=%p %.*s, emptydir", - dentry, dentry->d_name.len, dentry->d_name.name); - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - - /* The daemon never causes a mount to trigger */ - if (oz_mode) - return 1; - - /* - * A zero status is success otherwise we have a - * negative error code. - */ - status = try_to_fill_dentry(dentry, flags); - if (status == 0) - return 1; - - return status; - } - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - - return 1; -} - -void autofs4_dentry_release(struct dentry *de) -{ - struct autofs_info *inf; + struct autofs_info *ino = autofs4_dentry_ino(de); + struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); DPRINTK("releasing %p", de); - inf = autofs4_dentry_ino(de); - de->d_fsdata = NULL; - - if (inf) { - struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); - - if (sbi) { - spin_lock(&sbi->lookup_lock); - if (!list_empty(&inf->active)) - list_del(&inf->active); - if (!list_empty(&inf->expiring)) - list_del(&inf->expiring); - spin_unlock(&sbi->lookup_lock); - } - - inf->dentry = NULL; - inf->inode = NULL; + if (!ino) + return; - autofs4_free_ino(inf); + if (sbi) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del(&ino->active); + if (!list_empty(&ino->expiring)) + list_del(&ino->expiring); + spin_unlock(&sbi->lookup_lock); } -} -/* For dentries of directories in the root dir */ -static const struct dentry_operations autofs4_root_dentry_operations = { - .d_revalidate = autofs4_revalidate, - .d_release = autofs4_dentry_release, -}; - -/* For other dentries */ -static const struct dentry_operations autofs4_dentry_operations = { - .d_revalidate = autofs4_revalidate, - .d_release = autofs4_dentry_release, -}; + autofs4_free_ino(ino); +} static struct dentry *autofs4_lookup_active(struct dentry *dentry) { @@ -541,51 +271,246 @@ next: return NULL; } +static int autofs4_mount_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + if (ino->flags & AUTOFS_INF_PENDING) { + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); + status = 
autofs4_wait(sbi, dentry, NFY_MOUNT); + DPRINTK("mount wait done status=%d", status); + ino->last_used = jiffies; + return status; + } + return 0; +} + +static int do_expire_wait(struct dentry *dentry) +{ + struct dentry *expiring; + + expiring = autofs4_lookup_expiring(dentry); + if (!expiring) + return autofs4_expire_wait(dentry); + else { + /* + * If we are racing with expire the request might not + * be quite complete, but the directory has been removed + * so it must have been successful, just wait for it. + */ + autofs4_expire_wait(expiring); + autofs4_del_expiring(expiring); + dput(expiring); + } + return 0; +} + +static struct dentry *autofs4_mountpoint_changed(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + /* + * If this is an indirect mount the dentry could have gone away + * as a result of an expire and a new one created. + */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + struct dentry *new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; + dput(path->dentry); + path->dentry = new; + } + return path->dentry; +} + +static struct vfsmount *autofs4_d_automount(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* + * Someone may have manually umounted this or it was a submount + * that has gone away. + */ + spin_lock(&dentry->d_lock); + if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { + if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + (dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_transit(path->dentry); + } + spin_unlock(&dentry->d_lock); + + /* The daemon never triggers a mount. */ + if (autofs4_oz_mode(sbi)) + return NULL; + + /* + * If an expire request is pending everyone must wait. + * If the expire fails we're still mounted so continue + * the follow and return. A return of -EAGAIN (which only + * happens with indirect mounts) means the expire completed + * and the directory was removed, so just go ahead and try + * the mount. + */ + status = do_expire_wait(dentry); + if (status && status != -EAGAIN) + return NULL; + + /* Callback to the daemon to perform the mount or wait */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + goto done; + } + + /* + * If the dentry is a symlink it's equivalent to a directory + * having d_mountpoint() true, so there's no need to call back + * to the daemon. + */ + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) + goto done; + if (!d_mountpoint(dentry)) { + /* + * It's possible that user space hasn't removed directories + * after umounting a rootless multi-mount, although it + * should. For v5 have_submounts() is sufficient to handle + * this because the leaves of the directory tree under the + * mount never trigger mounts themselves (they have an autofs + * trigger mount mounted on them). But v4 pseudo direct mounts + * do need the leaves to trigger mounts. In this case we + * have no choice but to use the list_empty() check and + * require user space to behave. 
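+ * (So for v4 the callback to the daemon is skipped whenever the + * directory already has subdirectories, even if nothing is mounted + * on it.)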
+ */ + if (sbi->version > 4) { + if (have_submounts(dentry)) + goto done; + } else { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + goto done; + } + spin_unlock(&dentry->d_lock); + } + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + } +done: + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path updated + * so turn this into a normal dentry so we don't continually + * call ->d_automount() and ->d_manage(). + */ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + /* + * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and + * symlinks as in all other cases the dentry will be covered by + * an actual mount so ->d_automount() won't be called during + * the follow. + */ + if ((!d_mountpoint(dentry) && + !list_empty(&dentry->d_subdirs)) || + (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->fs_lock); + + /* Mount succeeded, check if we ended up with a new dentry */ + dentry = autofs4_mountpoint_changed(path); + if (!dentry) + return ERR_PTR(-ENOENT); + + return NULL; +} + +int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* The daemon never waits. */ + if (autofs4_oz_mode(sbi) || mounting_here) { + if (!d_mountpoint(dentry)) + return -EISDIR; + return 0; + } + + /* We need to sleep, so we need pathwalk to be in ref-mode */ + if (rcu_walk) + return -ECHILD; + + /* Wait for pending expires */ + do_expire_wait(dentry); + + /* + * This dentry may be under construction so wait on mount + * completion. + */ + return autofs4_mount_wait(dentry); +} + /* Lookups in the root directory */ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; struct autofs_info *ino; - struct dentry *expiring, *active; - int oz_mode; + struct dentry *active; - DPRINTK("name = %.*s", - dentry->d_name.len, dentry->d_name.name); + DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); sbi = autofs4_sbi(dir->i_sb); - oz_mode = autofs4_oz_mode(sbi); DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", - current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); active = autofs4_lookup_active(dentry); if (active) { - dentry = active; - ino = autofs4_dentry_ino(dentry); + return active; } else { /* - * Mark the dentry incomplete but don't hash it. We do this - * to serialize our inode creation operations (symlink and - * mkdir) which prevents deadlock during the callback to - * the daemon. Subsequent user space lookups for the same - * dentry are placed on the wait queue while the daemon - * itself is allowed passage unresticted so the create - * operation itself can then hash the dentry. Finally, - * we check for the hashed dentry and return the newly - * hashed dentry. 
+ * A dentry that is not within the root can never trigger a + * mount operation, unless the directory already exists, so we + * can return fail immediately. The daemon however does need + * to create directories within the file system. */ - d_set_d_op(dentry, &autofs4_root_dentry_operations); + if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + return ERR_PTR(-ENOENT); - /* - * And we need to ensure that the same dentry is used for - * all following lookup calls until it is hashed so that - * the dentry flags are persistent throughout the request. - */ - ino = autofs4_init_ino(NULL, sbi, 0555); + /* Mark entries in the root as mount triggers */ + if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + __managed_dentry_set_managed(dentry); + + ino = autofs4_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); @@ -596,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s d_instantiate(dentry, NULL); } - - if (!oz_mode) { - mutex_unlock(&dir->i_mutex); - expiring = autofs4_lookup_expiring(dentry); - if (expiring) { - /* - * If we are racing with expire the request might not - * be quite complete but the directory has been removed - * so it must have been successful, so just wait for it. - */ - autofs4_expire_wait(expiring); - autofs4_del_expiring(expiring); - dput(expiring); - } - - spin_lock(&sbi->fs_lock); - ino->flags |= AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - if (dentry->d_op && dentry->d_op->d_revalidate) - (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); - } - - /* - * If we are still pending, check if we had to handle - * a signal. If so we can force a restart.. - */ - if (ino->flags & AUTOFS_INF_PENDING) { - /* See if we were interrupted */ - if (signal_pending(current)) { - sigset_t *sigset = ¤t->pending.signal; - if (sigismember (sigset, SIGKILL) || - sigismember (sigset, SIGQUIT) || - sigismember (sigset, SIGINT)) { - if (active) - dput(active); - return ERR_PTR(-ERESTARTNOINTR); - } - } - if (!oz_mode) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - } - } - - /* - * If this dentry is unhashed, then we shouldn't honour this - * lookup. Returning ENOENT here doesn't do the right thing - * for all system calls, but it should be OK for the operations - * we permit from an autofs. - */ - if (!oz_mode && d_unhashed(dentry)) { - /* - * A user space application can (and has done in the past) - * remove and re-create this directory during the callback. - * This can leave us with an unhashed dentry, but a - * successful mount! So we need to perform another - * cached lookup in case the dentry now exists. 
- */ - struct dentry *parent = dentry->d_parent; - struct dentry *new = d_lookup(parent, &dentry->d_name); - if (new != NULL) - dentry = new; - else - dentry = ERR_PTR(-ENOENT); - - if (active) - dput(active); - - return dentry; - } - - if (active) - return active; - return NULL; } @@ -683,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir, struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; + size_t size = strlen(symname); char *cp; DPRINTK("%s <- %.*s", symname, @@ -691,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir, if (!autofs4_oz_mode(sbi)) return -EACCES; - ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); - if (!ino) - return -ENOMEM; + BUG_ON(!ino); + + autofs4_clean_ino(ino); autofs4_del_active(dentry); - ino->size = strlen(symname); - cp = kmalloc(ino->size + 1, GFP_KERNEL); - if (!cp) { - if (!dentry->d_fsdata) - kfree(ino); + cp = kmalloc(size + 1, GFP_KERNEL); + if (!cp) return -ENOMEM; - } strcpy(cp, symname); - inode = autofs4_get_inode(dir->i_sb, ino); + inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); if (!dentry->d_fsdata) kfree(ino); return -ENOMEM; } + inode->i_private = cp; + inode->i_size = size; d_add(dentry, inode); - if (dir == dir->i_sb->s_root->d_inode) - d_set_d_op(dentry, &autofs4_root_dentry_operations); - else - d_set_d_op(dentry, &autofs4_dentry_operations); - - dentry->d_fsdata = ino; - ino->dentry = dget(dentry); + dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_inc(&p_ino->count); - ino->inode = inode; - ino->u.symlink = cp; dir->i_mtime = CURRENT_TIME; return 0; @@ -782,6 +622,58 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) return 0; } +/* + * Version 4 of autofs provides a pseudo direct mount implementation + * that relies on directories at the leaves of a directory tree under + * an indirect mount to trigger mounts. To allow for this we need to + * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves + * of the directory tree. There is no need to clear the automount flag + * following a mount or restore it after an expire because these mounts + * are always covered. However, it is necessary to ensure that these + * flags are clear on non-empty directories to avoid unnecessary calls + * during path walks. 
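+ * + * Concretely, autofs_set_leaf_automount_flags() below marks a newly + * created leaf directory managed and unmarks its (no longer leaf) + * parent, while autofs_clear_leaf_automount_flags() reverses this when + * a leaf is removed and its parent becomes empty again.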
+ */ +static void autofs_set_leaf_automount_flags(struct dentry *dentry) +{ + struct dentry *parent; + + /* root and dentries in the root are already handled */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_set_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentries in the root */ + if (IS_ROOT(parent->d_parent)) + return; + managed_dentry_clear_managed(parent); + return; +} + +static void autofs_clear_leaf_automount_flags(struct dentry *dentry) +{ + struct list_head *d_child; + struct dentry *parent; + + /* flags for dentries in the root are handled elsewhere */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_clear_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentries in the root */ + if (IS_ROOT(parent->d_parent)) + return; + d_child = &dentry->d_u.d_child; + /* Set parent managed if it's becoming empty */ + if (d_child->next == &parent->d_subdirs && + d_child->prev == &parent->d_subdirs) + managed_dentry_set_managed(parent); + return; +} + static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); @@ -809,6 +701,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); spin_unlock(&autofs4_lock); + if (sbi->version < 5) + autofs_clear_leaf_automount_flags(dentry); + if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) @@ -837,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) DPRINTK("dentry %p, creating %.*s", dentry, dentry->d_name.len, dentry->d_name.name); - ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); - if (!ino) - return -ENOMEM; + BUG_ON(!ino); + + autofs4_clean_ino(ino); autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, ino); - if (!inode) { - if (!dentry->d_fsdata) - kfree(ino); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + if (!inode) return -ENOMEM; - } d_add(dentry, inode); - if (dir == dir->i_sb->s_root->d_inode) - d_set_d_op(dentry, &autofs4_root_dentry_operations); - else - d_set_d_op(dentry, &autofs4_dentry_operations); + if (sbi->version < 5) + autofs_set_leaf_automount_flags(dentry); - dentry->d_fsdata = ino; - ino->dentry = dget(dentry); + dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_inc(&p_ino->count); - ino->inode = inode; inc_nlink(dir); dir->i_mtime = CURRENT_TIME; @@ -944,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) int is_autofs4_dentry(struct dentry *dentry) { return dentry && dentry->d_inode && - (dentry->d_op == &autofs4_root_dentry_operations || - dentry->d_op == &autofs4_dentry_operations) && + dentry->d_op == &autofs4_dentry_operations && dentry->d_fsdata != NULL; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index b4ea82934d2..f27c094a191 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -14,8 +14,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); - nd_set_link(nd, (char *)ino->u.symlink); + nd_set_link(nd, dentry->d_inode->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index c5f8459c905..56010056b2e 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -309,6 +309,9 @@ static int validate_request(struct 
autofs_wait_queue **wait, * completed while we waited on the mutex ... */ if (notify == NFY_MOUNT) { + struct dentry *new = NULL; + int valid = 1; + /* * If the dentry was successfully mounted while we slept * on the wait queue mutex we can return success. If it @@ -316,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait, * a multi-mount with no mount at it's base) we can * continue on and create a new request. */ + if (!IS_ROOT(dentry)) { + if (dentry->d_inode && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); + if (new) + dentry = new; + } + } if (have_submounts(dentry)) - return 0; + valid = 0; + + if (new) + dput(new); + return valid; } return 1; diff --git a/fs/befs/endian.h b/fs/befs/endian.h index 6cb84d896d0..27223878ba9 100644 --- a/fs/befs/endian.h +++ b/fs/befs/endian.h @@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n) } static inline befs_data_stream -fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n) +fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n) { befs_data_stream data; int i; for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) - data.direct[i] = fsrun_to_cpu(sb, n.direct[i]); + data.direct[i] = fsrun_to_cpu(sb, n->direct[i]); - data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range); - data.indirect = fsrun_to_cpu(sb, n.indirect); - data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range); - data.double_indirect = fsrun_to_cpu(sb, n.double_indirect); + data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range); + data.indirect = fsrun_to_cpu(sb, n->indirect); + data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range); + data.double_indirect = fsrun_to_cpu(sb, n->double_indirect); data.max_double_indirect_range = fs64_to_cpu(sb, - n. 
+ n-> max_double_indirect_range); - data.size = fs64_to_cpu(sb, n.size); + data.size = fs64_to_cpu(sb, n->size); return data; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index de93581b79a..b1d0c794747 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -390,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) int num_blks; befs_ino->i_data.ds = - fsds_to_cpu(sb, raw_inode->data.datastream); + fsds_to_cpu(sb, &raw_inode->data.datastream); num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); inode->i_blocks = diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6884e198e0c..d5b640ba6cb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) static struct linux_binfmt elf_format = { - .module = THIS_MODULE, - .load_binary = load_elf_binary, - .load_shlib = load_elf_library, - .core_dump = elf_core_dump, - .min_coredump = ELF_EXEC_PAGESIZE, - .hasvdso = 1 + .module = THIS_MODULE, + .load_binary = load_elf_binary, + .load_shlib = load_elf_library, + .core_dump = elf_core_dump, + .min_coredump = ELF_EXEC_PAGESIZE, }; #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) @@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return 0; } -#ifndef elf_map - static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -#endif /* !elf_map */ - static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, goto out; retval = kernel_read(interpreter, interp_elf_ex->e_phoff, - (char *)elf_phdata,size); + (char *)elf_phdata, size); error = -EIO; if (retval != size) { if (retval < 0) @@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out; if (!elf_check_arch(&loc->elf_ex)) goto out; - if (!bprm->file->f_op||!bprm->file->f_op->mmap) + if (!bprm->file->f_op || !bprm->file->f_op->mmap) goto out; /* Now read in all of the header information */ @@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. */ - retval = set_brk (elf_bss + load_bias, - elf_brk + load_bias); + retval = set_brk(elf_bss + load_bias, + elf_brk + load_bias); if (retval) { send_sig(SIGKILL, current, 0); goto out_free_dentry; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 4d0ff5ee27b..e49cce234c6 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -782,7 +782,12 @@ void __init bio_integrity_init(void) { unsigned int i; - kintegrityd_wq = create_workqueue("kintegrityd"); + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. 
+ */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 771f2352701..333a7bb4cb9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -433,7 +433,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_list); + INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ @@ -473,7 +473,7 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); } static struct file_system_type bd_type = { @@ -669,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, else if (bdev->bd_contains == bdev) return true; /* is a whole device which isn't held */ - else if (whole->bd_holder == bd_claim) + else if (whole->bd_holder == bd_may_claim) return true; /* is a partition of a device that is being partitioned */ else if (whole->bd_holder != NULL) return false; /* is a partition of a held device */ @@ -781,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, } } -/* releases bdev_lock */ -static void __bd_abort_claiming(struct block_device *whole, void *holder) -{ - BUG_ON(whole->bd_claiming != holder); - whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); - - spin_unlock(&bdev_lock); - bdput(whole); -} - -/** - * bd_abort_claiming - abort claiming a block device - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Abort a claiming block started by bd_start_claiming(). Note that - * @whole is not the block device to be claimed but the whole device - * returned by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. - */ -static void bd_abort_claiming(struct block_device *whole, void *holder) -{ - spin_lock(&bdev_lock); - __bd_abort_claiming(whole, holder); /* releases bdev_lock */ -} - -/* increment holders when we have a legitimate claim. requires bdev_lock */ -static void __bd_claim(struct block_device *bdev, struct block_device *whole, - void *holder) -{ - /* note that for a whole device bd_holders - * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; -} - -/** - * bd_finish_claiming - finish claiming a block device - * @bdev: block device of interest (passed to bd_start_claiming()) - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Finish a claiming block started by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. 
- */ -static void bd_finish_claiming(struct block_device *bdev, - struct block_device *whole, void *holder) -{ - spin_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, whole, holder)); - __bd_claim(bdev, whole, holder); - __bd_abort_claiming(whole, holder); /* not actually an abort */ -} +#ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; -/** - * bd_claim - claim a block device - * @bdev: block device to claim - * @holder: holder trying to claim @bdev - * - * Try to claim @bdev which must have been opened successfully. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 if successful, -EBUSY if @bdev is already claimed. - */ -int bd_claim(struct block_device *bdev, void *holder) +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) { - struct block_device *whole = bdev->bd_contains; - int res; + struct bd_holder_disk *holder; - might_sleep(); - - spin_lock(&bdev_lock); - res = bd_prepare_to_claim(bdev, whole, holder); - if (res == 0) - __bd_claim(bdev, whole, holder); - spin_unlock(&bdev_lock); - - return res; -} -EXPORT_SYMBOL(bd_claim); - -void bd_release(struct block_device *bdev) -{ - spin_lock(&bdev_lock); - if (!--bdev->bd_contains->bd_holders) - bdev->bd_contains->bd_holder = NULL; - if (!--bdev->bd_holders) - bdev->bd_holder = NULL; - spin_unlock(&bdev_lock); + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; } -EXPORT_SYMBOL(bd_release); - -#ifdef CONFIG_SYSFS -/* - * Functions for bd_claim_by_kobject / bd_release_from_kobject - * - * If a kobject is passed to bd_claim_by_kobject() - * and the kobject has a parent directory, - * following symlinks are created: - * o from the kobject to the claimed bdev - * o from "holders" directory of the bdev to the parent of the kobject - * bd_release_from_kobject() removes these symlinks. - * - * Example: - * If /dev/dm-0 maps to /dev/sda, kobject corresponding to - * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - */ - static int add_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return 0; return sysfs_create_link(from, to, kobject_name(to)); } static void del_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return; sysfs_remove_link(from, kobject_name(to)); } -/* - * 'struct bd_holder' contains pointers to kobjects symlinked by - * bd_claim_by_kobject. - * It's connected to bd_holder_list which is protected by bdev->bd_sem. - */ -struct bd_holder { - struct list_head list; /* chain of holders of the bdev */ - int count; /* references from the holder */ - struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ - struct kobject *hdev; /* e.g. "/block/dm-0" */ - struct kobject *hdir; /* e.g. "/block/sda/holders" */ - struct kobject *sdev; /* e.g. "/block/sda" */ -}; - -/* - * Get references of related kobjects at once. - * Returns 1 on success. 0 on failure. - * - * Should call bd_holder_release_dirs() after successful use. 
- */
-static int bd_holder_grab_dirs(struct block_device *bdev,
-			struct bd_holder *bo)
-{
-	if (!bdev || !bo)
-		return 0;
-
-	bo->sdir = kobject_get(bo->sdir);
-	if (!bo->sdir)
-		return 0;
-
-	bo->hdev = kobject_get(bo->sdir->parent);
-	if (!bo->hdev)
-		goto fail_put_sdir;
-
-	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
-	if (!bo->sdev)
-		goto fail_put_hdev;
-
-	bo->hdir = kobject_get(bdev->bd_part->holder_dir);
-	if (!bo->hdir)
-		goto fail_put_sdev;
-
-	return 1;
-
-fail_put_sdev:
-	kobject_put(bo->sdev);
-fail_put_hdev:
-	kobject_put(bo->hdev);
-fail_put_sdir:
-	kobject_put(bo->sdir);
-
-	return 0;
-}
-
-/* Put references of related kobjects at once. */
-static void bd_holder_release_dirs(struct bd_holder *bo)
-{
-	kobject_put(bo->hdir);
-	kobject_put(bo->sdev);
-	kobject_put(bo->hdev);
-	kobject_put(bo->sdir);
-}
-
-static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
-{
-	struct bd_holder *bo;
-
-	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
-	if (!bo)
-		return NULL;
-
-	bo->count = 1;
-	bo->sdir = kobj;
-
-	return bo;
-}
-
-static void free_bd_holder(struct bd_holder *bo)
-{
-	kfree(bo);
-}
-
 /**
- * find_bd_holder - find matching struct bd_holder from the block device
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
 *
- * @bdev: struct block device to be searched
- * @bo: target struct bd_holder
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
- * Returns matching entry with @bo in @bdev->bd_holder_list.
- * If found, increment the reference count and return the pointer.
- * If not found, returns NULL.
- */
-static struct bd_holder *find_bd_holder(struct block_device *bdev,
-					struct bd_holder *bo)
-{
-	struct bd_holder *tmp;
-
-	list_for_each_entry(tmp, &bdev->bd_holder_list, list)
-		if (tmp->sdir == bo->sdir) {
-			tmp->count++;
-			return tmp;
-		}
-
-	return NULL;
-}
-
-/**
- * add_bd_holder - create sysfs symlinks for bd_claim() relationship
+ * This function creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
 *
- * @bdev: block device to be bd_claimed
- * @bo: preallocated and initialized by alloc_bd_holder()
+ * /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 *
- * Add @bo to @bdev->bd_holder_list, create symlinks.
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
 *
- * Returns 0 if symlinks are created.
- * Returns -ve if something fails.
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
*/ -static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { - int err; + struct bd_holder_disk *holder; + int ret = 0; - if (!bo) - return -EINVAL; + mutex_lock(&bdev->bd_mutex); - if (!bd_holder_grab_dirs(bdev, bo)) - return -EBUSY; + WARN_ON_ONCE(!bdev->bd_holder); - err = add_symlink(bo->sdir, bo->sdev); - if (err) - return err; + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + goto out_unlock; - err = add_symlink(bo->hdir, bo->hdev); - if (err) { - del_symlink(bo->sdir, bo->sdev); - return err; + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; } - list_add_tail(&bo->list, &bdev->bd_holder_list); - return 0; -} - -/** - * del_bd_holder - delete sysfs symlinks for bd_claim() relationship - * - * @bdev: block device to be bd_claimed - * @kobj: holder's kobject - * - * If there is matching entry with @kobj in @bdev->bd_holder_list - * and no other bd_claim() from the same kobject, - * remove the struct bd_holder from the list, delete symlinks for it. - * - * Returns a pointer to the struct bd_holder when it's removed from the list - * and ready to be freed. - * Returns NULL if matching claim isn't found or there is other bd_claim() - * by the same kobject. - */ -static struct bd_holder *del_bd_holder(struct block_device *bdev, - struct kobject *kobj) -{ - struct bd_holder *bo; - - list_for_each_entry(bo, &bdev->bd_holder_list, list) { - if (bo->sdir == kobj) { - bo->count--; - BUG_ON(bo->count < 0); - if (!bo->count) { - list_del(&bo->list); - del_symlink(bo->sdir, bo->sdev); - del_symlink(bo->hdir, bo->hdev); - bd_holder_release_dirs(bo); - return bo; - } - break; - } + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; } - return NULL; -} - -/** - * bd_claim_by_kobject - bd_claim() with additional kobject signature - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @kobj: holder's kobject - * - * Do bd_claim() and if it succeeds, create sysfs symlinks between - * the bdev and the holder's kobject. - * Use bd_release_from_kobject() when relesing the claimed bdev. - * - * Returns 0 on success. (same as bd_claim()) - * Returns errno on failure. 
- */
-static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
-				struct kobject *kobj)
-{
-	int err;
-	struct bd_holder *bo, *found;
-
-	if (!kobj)
-		return -EINVAL;
-
-	bo = alloc_bd_holder(kobj);
-	if (!bo)
-		return -ENOMEM;
+	INIT_LIST_HEAD(&holder->list);
+	holder->disk = disk;
+	holder->refcnt = 1;
 
-	mutex_lock(&bdev->bd_mutex);
+	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	if (ret)
+		goto out_free;
 
-	err = bd_claim(bdev, holder);
-	if (err)
-		goto fail;
+	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+	if (ret)
+		goto out_del;
 
-	found = find_bd_holder(bdev, bo);
-	if (found)
-		goto fail;
+	list_add(&holder->list, &bdev->bd_holder_disks);
+	goto out_unlock;
 
-	err = add_bd_holder(bdev, bo);
-	if (err)
-		bd_release(bdev);
-	else
-		bo = NULL;
-fail:
+out_del:
+	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+out_free:
+	kfree(holder);
+out_unlock:
 	mutex_unlock(&bdev->bd_mutex);
-	free_bd_holder(bo);
-	return err;
+	return ret;
 }
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
 
 /**
- * bd_release_from_kobject - bd_release() with additional kobject signature
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
 *
- * @bdev: block device to be released
- * @kobj: holder's kobject
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
- * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
+ * CONTEXT:
+ * Might sleep.
 */
-static void bd_release_from_kobject(struct block_device *bdev,
-					struct kobject *kobj)
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 {
-	if (!kobj)
-		return;
+	struct bd_holder_disk *holder;
 
 	mutex_lock(&bdev->bd_mutex);
-	bd_release(bdev);
-	free_bd_holder(del_bd_holder(bdev, kobj));
-	mutex_unlock(&bdev->bd_mutex);
-}
 
-/**
- * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
- *
- * @bdev: block device to be claimed
- * @holder: holder's signature
- * @disk: holder's gendisk
- *
- * Call bd_claim_by_kobject() with getting @disk->slave_dir.
- */
-int bd_claim_by_disk(struct block_device *bdev, void *holder,
-			struct gendisk *disk)
-{
-	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
-}
-EXPORT_SYMBOL_GPL(bd_claim_by_disk);
+	holder = bd_find_holder_disk(bdev, disk);
 
-/**
- * bd_release_from_disk - wrapper function for bd_release_from_kobject()
- *
- * @bdev: block device to be claimed
- * @disk: holder's gendisk
- *
- * Call bd_release_from_kobject() and put @disk->slave_dir.
- */
-void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
-{
-	bd_release_from_kobject(bdev, disk->slave_dir);
-	kobject_put(disk->slave_dir);
-}
-EXPORT_SYMBOL_GPL(bd_release_from_disk);
-#endif
+	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+		del_symlink(bdev->bd_part->holder_dir,
+			    &disk_to_dev(disk)->kobj);
+		list_del_init(&holder->list);
+		kfree(holder);
+	}
 
-/*
- * Tries to open block device by device number. Use it ONLY if you
- * really do not have anything better - i.e. when you are behind a
- * truly sucky interface and all you are given is a device number. _Never_
- * to be used for internal purposes. If you ever need it - reconsider
- * your API.
- */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
-{
-	struct block_device *bdev = bdget(dev);
-	int err = -ENOMEM;
-	if (bdev)
-		err = blkdev_get(bdev, mode);
-	return err ?
ERR_PTR(err) : bdev; + mutex_unlock(&bdev->bd_mutex); } - -EXPORT_SYMBOL(open_by_devnum); +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#endif /** * flush_disk - invalidates all buffer-cache entries on a disk @@ -1309,10 +1012,11 @@ int check_disk_change(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; const struct block_device_operations *bdops = disk->fops; + unsigned int events; - if (!bdops->media_changed) - return 0; - if (!bdops->media_changed(bdev->bd_disk)) + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; flush_disk(bdev); @@ -1475,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } -int blkdev_get(struct block_device *bdev, fmode_t mode) +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - return __blkdev_get(bdev, mode, 0); + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + /* __blkdev_get() may alter read only status, check it afterwards */ + if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { + __blkdev_put(bdev, mode, 0); + res = -EACCES; + } + + if (whole) { + /* finish claiming */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + if (!res) { + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders + * will be incremented twice, and bd_holder + * will be set to bd_may_claim before being + * set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + } + + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims. Any write + * holder makes the write_holder state stick until all + * are released. This is good enough and tracking + * individual writeable reference is too fragile given + * the way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + bdev->bd_write_holder = true; + disk_block_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(whole); + } + + return res; } EXPORT_SYMBOL(blkdev_get); +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. @mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. 
+ */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + static int blkdev_open(struct inode * inode, struct file * filp) { - struct block_device *whole = NULL; struct block_device *bdev; - int res; /* * Preserve backwards compatibility and allow large file access @@ -1506,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; - if (filp->f_mode & FMODE_EXCL) { - whole = bd_start_claiming(bdev, filp); - if (IS_ERR(whole)) { - bdput(bdev); - return PTR_ERR(whole); - } - } - filp->f_mapping = bdev->bd_inode->i_mapping; - res = blkdev_get(bdev, filp->f_mode); - - if (whole) { - if (res == 0) - bd_finish_claiming(bdev, whole, filp); - else - bd_abort_claiming(whole, filp); - } - - return res; + return blkdev_get(bdev, filp->f_mode, filp); } static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -1539,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_part_count--; if (!--bdev->bd_openers) { + WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); } @@ -1569,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + if (mode & FMODE_EXCL) { + bool bdev_free; + + /* + * Release a claim on the device. The holder fields + * are protected with bdev_lock. bd_mutex is to + * synchronize disk_holder unlinking. + */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + + /* bd_contains might point to self, check in a separate step */ + if ((bdev_free = !bdev->bd_holders)) + bdev->bd_holder = NULL; + if (!bdev->bd_contains->bd_holders) + bdev->bd_contains->bd_holder = NULL; + + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. 
+ */ + if (bdev_free) { + if (bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } else + disk_check_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); + } else + disk_check_events(bdev->bd_disk); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); @@ -1576,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - if (bdev->bd_holder == filp) - bd_release(bdev); + return blkdev_put(bdev, filp->f_mode); } @@ -1722,67 +1601,6 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -/** - * open_bdev_exclusive - open a block device by name and set it up for use - * - * @path: special file representing the block device - * @mode: FMODE_... combination to pass be used - * @holder: owner for exclusion - * - * Open the blockdevice described by the special file at @path, claim it - * for the @holder. - */ -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) -{ - struct block_device *bdev, *whole; - int error; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; - - whole = bd_start_claiming(bdev, holder); - if (IS_ERR(whole)) { - bdput(bdev); - return whole; - } - - error = blkdev_get(bdev, mode); - if (error) - goto out_abort_claiming; - - error = -EACCES; - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto out_blkdev_put; - - bd_finish_claiming(bdev, whole, holder); - return bdev; - -out_blkdev_put: - blkdev_put(bdev, mode); -out_abort_claiming: - bd_abort_claiming(whole, holder); - return ERR_PTR(error); -} - -EXPORT_SYMBOL(open_bdev_exclusive); - -/** - * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() - * - * @bdev: blockdevice to close - * @mode: mode, must match that used to open. - * - * This is the counterpart to open_bdev_exclusive(). - */ -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) -{ - bd_release(bdev); - blkdev_put(bdev, mode); -} - -EXPORT_SYMBOL(close_bdev_exclusive); - int __invalidate_device(struct block_device *bdev) { struct super_block *sb = get_super(bdev); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 7bb3c020e57..ecb9fd3be14 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,6 +4,8 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. 
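
The open_bdev_exclusive()/close_bdev_exclusive() pair removed above is replaced by blkdev_get_by_path() and blkdev_put(). A minimal sketch of a converted caller, assuming only the exported functions shown in this patch (the example_* wrapper names are invented):

#include <linux/blkdev.h>
#include <linux/err.h>

/* Sketch of a caller converted from open_bdev_exclusive(): the
 * exclusive claim is now requested by passing FMODE_EXCL plus a
 * non-NULL holder, and the read-only check that open_bdev_exclusive()
 * did by hand is performed inside blkdev_get(). */
static struct block_device *example_open_bdev(const char *path, void *holder)
{
	return blkdev_get_by_path(path,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
}

static void example_close_bdev(struct block_device *bdev)
{
	/* The mode must match the open, FMODE_EXCL included, so the
	 * claim taken above is dropped exactly once. */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}

Passing FMODE_EXCL without a holder would trip the WARN_ON_ONCE() in blkdev_get(), so callers always supply both together.
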
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a35eb36b32f..31610ea73ae 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ae2c8cac9d..15b5ca2a260 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) + if (IS_ERR(acl)) { + kfree(value); return acl; + } set_cached_acl(inode, type, acl); } kfree(value); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6ad63f17eca..ccc991c542d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -157,7 +157,7 @@ struct btrfs_inode { /* * always compress this one file */ - unsigned force_compress:1; + unsigned force_compress:4; struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b50bc4bd5c5..f745287fbf2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -62,6 +62,9 @@ struct compressed_bio { /* number of bytes on disk */ unsigned long compressed_len; + /* the compression algorithm for this bio */ + int compress_type; + /* number of compressed pages in the array */ unsigned long nr_pages; @@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, - cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, - cb->compressed_len); + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); csum_failed: if (ret) cb->errors = 1; @@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / @@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_put(comp_bio); return 0; } + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +int __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } + return 0; +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. 
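
The btrfs_compress_op[] table initialized above is the single dispatch point compression.c uses (always indexed as type - 1), so adding an algorithm means supplying one more ops structure of the shape declared in compression.h further down. For illustration only, a hypothetical third compressor would plug in like this (the foo_* names are invented and not part of the patch):

/* Hypothetical third compressor, shown only to illustrate the hook
 * points; the patch itself defines just btrfs_zlib_compress and
 * btrfs_lzo_compress. */
static struct list_head *foo_alloc_workspace(void);
static void foo_free_workspace(struct list_head *ws);
static int foo_compress_pages(struct list_head *ws,
			      struct address_space *mapping,
			      u64 start, unsigned long len,
			      struct page **pages,
			      unsigned long nr_dest_pages,
			      unsigned long *out_pages,
			      unsigned long *total_in,
			      unsigned long *total_out,
			      unsigned long max_out);
static int foo_decompress_biovec(struct list_head *ws,
				 struct page **pages_in, u64 disk_start,
				 struct bio_vec *bvec, int vcnt,
				 size_t srclen);
static int foo_decompress(struct list_head *ws, unsigned char *data_in,
			  struct page *dest_page, unsigned long start_byte,
			  size_t srclen, size_t destlen);

struct btrfs_compress_op btrfs_foo_compress = {
	.alloc_workspace	= foo_alloc_workspace,
	.free_workspace		= foo_free_workspace,
	.compress_pages		= foo_compress_pages,
	.decompress_biovec	= foo_decompress_biovec,
	.decompress		= foo_decompress,
};
/* plus a BTRFS_COMPRESS_FOO value, a bump of BTRFS_COMPRESS_TYPES,
 * and an entry in the btrfs_compress_op[] array in compression.c. */
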
+ */
+static struct list_head *find_workspace(int type)
+{
+	struct list_head *workspace;
+	int cpus = num_online_cpus();
+	int idx = type - 1;
+
+	struct list_head *idle_workspace = &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+	int *num_workspace = &comp_num_workspace[idx];
+again:
+	spin_lock(workspace_lock);
+	if (!list_empty(idle_workspace)) {
+		workspace = idle_workspace->next;
+		list_del(workspace);
+		(*num_workspace)--;
+		spin_unlock(workspace_lock);
+		return workspace;
+
+	}
+	if (atomic_read(alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+
+		spin_unlock(workspace_lock);
+		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+			schedule();
+		finish_wait(workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(alloc_workspace);
+	spin_unlock(workspace_lock);
+
+	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	if (IS_ERR(workspace)) {
+		atomic_dec(alloc_workspace);
+		wake_up(workspace_wait);
+	}
+	return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+	int idx = type - 1;
+	struct list_head *idle_workspace = &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+	int *num_workspace = &comp_num_workspace[idx];
+
+	spin_lock(workspace_lock);
+	if (*num_workspace < num_online_cpus()) {
+		list_add_tail(workspace, idle_workspace);
+		(*num_workspace)++;
+		spin_unlock(workspace_lock);
+		goto wake;
+	}
+	spin_unlock(workspace_lock);
+
+	btrfs_compress_op[idx]->free_workspace(workspace);
+	atomic_dec(alloc_workspace);
+wake:
+	if (waitqueue_active(workspace_wait))
+		wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+	struct list_head *workspace;
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		while (!list_empty(&comp_idle_workspace[i])) {
+			workspace = comp_idle_workspace[i].next;
+			list_del(workspace);
+			btrfs_compress_op[i]->free_workspace(workspace);
+			atomic_dec(&comp_alloc_workspace[i]);
+		}
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
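
find_workspace() and free_workspace() above implement a bounded pool: at most about one workspace per online CPU ever exists per compression type, idle ones are cached on a list, and callers past the limit sleep until a workspace comes back. A compact userspace analogue of the same pattern, with pthreads standing in for the spinlock and waitqueue and all names invented:

#include <pthread.h>
#include <stdlib.h>

struct ws { struct ws *next; };		/* stand-in for a workspace */

static struct ws *idle_list;		/* like comp_idle_workspace[] */
static int nr_idle, nr_alloc, limit = 4; /* like num/alloc_workspace, cpus */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t avail = PTHREAD_COND_INITIALIZER;

static struct ws *get_ws(void)
{
	struct ws *w;

	pthread_mutex_lock(&lock);
	/* over the limit with nothing idle: wait, like schedule() above */
	while (!idle_list && nr_alloc >= limit)
		pthread_cond_wait(&avail, &lock);
	if (idle_list) {		/* reuse a cached workspace */
		w = idle_list;
		idle_list = w->next;
		nr_idle--;
		pthread_mutex_unlock(&lock);
		return w;
	}
	nr_alloc++;			/* like atomic_inc(alloc_workspace) */
	pthread_mutex_unlock(&lock);

	w = calloc(1, sizeof(*w));
	if (!w) {			/* like the IS_ERR() path above */
		pthread_mutex_lock(&lock);
		nr_alloc--;
		pthread_mutex_unlock(&lock);
		pthread_cond_signal(&avail);
	}
	return w;
}

static void put_ws(struct ws *w)
{
	pthread_mutex_lock(&lock);
	if (nr_idle < limit) {		/* cache it for the next caller */
		w->next = idle_list;
		idle_list = w;
		nr_idle++;
		w = NULL;
	} else {
		nr_alloc--;		/* enough cached: really free it */
	}
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&avail);
	free(w);			/* free(NULL) is a no-op */
}
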
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -1;
+
+	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+							start, len, pages,
+							nr_dest_pages, out_pages,
+							total_in, total_out,
+							max_out);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+							   disk_start,
+							   bvec, vcnt, srclen);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+						    dest_page, start_byte,
+						    srclen, destlen);
+
+	free_workspace(type, workspace);
+	return ret;
+}
+
+void __exit btrfs_exit_compress(void)
+{
+	free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset, within the uncompressed data, of the
+ * start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset)
+{
+	unsigned long buf_offset;
+	unsigned long current_buf_start;
+	unsigned long start_byte;
+	unsigned long working_bytes = total_out - buf_start;
+	unsigned long bytes;
+	char *kaddr;
+	struct page *page_out = bvec[*page_index].bv_page;
+
+	/*
+	 * start byte is the first byte of the page we're currently
+	 * copying into relative to the start of the compressed data.
+ */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*page_index)++; + if (*page_index >= vcnt) + return 0; + + page_out = bvec[*page_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 421f5b4aa71..51000174b9d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,24 +19,27 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen); -void btrfs_zlib_exit(void); +int btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset); + int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, @@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int 
btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9ac17159925..b5baff0dccf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { + if (!p) + return; btrfs_release_path(NULL, p); kmem_cache_free(btrfs_path_cachep, p); } @@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + btrfs_tree_lock(left); btrfs_set_lock_blocking(left); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a142d204b52..2c98b3af605 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/kobject.h> #include <asm/kmap_types.h> #include "extent_io.h" #include "extent_map.h" @@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FSID_SIZE 16 #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ + +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) @@ -398,13 +407,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* * A leaf is full of items. 
offset and size tell us where to find @@ -551,9 +562,11 @@ struct btrfs_timespec { } __attribute__ ((__packed__)); enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LAST = 2, + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, }; struct btrfs_inode_item { @@ -597,6 +610,8 @@ struct btrfs_dir_item { u8 type; } __attribute__ ((__packed__)); +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -895,7 +910,8 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; u64 open_ioctl_trans; - unsigned long mount_opt; + unsigned long mount_opt:20; + unsigned long compress_type:4; u64 max_inline; u64 alloc_start; struct btrfs_transaction *running_transaction; @@ -1050,6 +1066,9 @@ struct btrfs_fs_info { unsigned metadata_ratio; void *bdev_holder; + + /* filesystem state */ + u64 fs_state; }; /* @@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; +} + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); @@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2541,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno); + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ +} while (0) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51d2e4de34e..b531c36455d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,20 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static int btrfs_destroy_ordered_operations(struct btrfs_root 
*root); +static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + WARN_ON(1); + goto out; + } ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); @@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + ret = -EIO; + goto out; + } found_start = btrfs_header_bytenr(eb); if (found_start != start) { @@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } btrfs_free_path(path); if (ret) { + kfree(root); if (ret > 0) ret = -ENOENT; return ERR_PTR(ret); @@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info, BTRFS_ROOT_TREE_OBJECTID); bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) + if (!bh) { + err = -EINVAL; goto fail_iput; + } memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); memcpy(&fs_info->super_for_commit, &fs_info->super_copy, @@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!btrfs_super_root(disk_super)) goto fail_iput; + /* check FS state, whether FS is broken. */ + fs_info->fs_state |= btrfs_super_flags(disk_super); + + btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; @@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, } features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - btrfs_set_super_incompat_flags(disk_super, features); - } + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; @@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_set_opt(fs_info->mount_opt, SSD); } - if (btrfs_super_log_root(disk_super) != 0) { + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0 && + !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root) smp_mb(); btrfs_put_block_group_cache(fs_info); + + /* + * Here come 2 situations when btrfs is broken to flip readonly: + * + * 1. 
when btrfs flips readonly somewhere else before
+	 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+	 * and btrfs will skip to write sb directly to keep
+	 * ERROR state on disk.
+	 *
+	 * 2. when btrfs flips readonly just in btrfs_commit_super,
+	 * and in such case, btrfs cannot write sb via btrfs_commit_super,
+	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+	 * btrfs will cleanup all FS resources first and write sb then.
+	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
-		ret = btrfs_commit_super(root);
+		ret = btrfs_commit_super(root);
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+	}
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		ret = btrfs_error_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
@@ -2619,6 +2671,352 @@ out:
 	return 0;
 }
 
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only)
+{
+	if (read_only)
+		return;
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		printk(KERN_WARNING "warning: mount fs with errors, "
+		       "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+	int ret;
+
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(root);
+
+	ret = write_ctree_super(NULL, root, 0);
+
+	return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 ordered_operations);
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while (!list_empty(&splice)) {
+		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+				     root_extent_list);
+
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+
+		/* the inode may be getting freed (in sys_unlink path).
*/ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + if (inode) + iput(inode); + + atomic_set(&ordered->refs, 1); + btrfs_put_ordered_extent(ordered); + + spin_lock(&root->fs_info->ordered_extent_lock); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (delayed_refs->num_entries == 0) { + printk(KERN_INFO "delayed_refs has NO entry\n"); + return ret; + } + + node = rb_first(&delayed_refs->root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + atomic_set(&ref->refs, 1); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + mutex_lock(&head->mutex); + kfree(head->extent_op); + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + list_del_init(&head->cluster); + mutex_unlock(&head->mutex); + } + + spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref(ref); + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +{ + struct btrfs_pending_snapshot *snapshot; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&t->pending_snapshots, &splice); + + while (!list_empty(&splice)) { + snapshot = list_entry(splice.next, + struct btrfs_pending_snapshot, + list); + + list_del_init(&snapshot->list); + + kfree(snapshot); + } + + return 0; +} + +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + + spin_lock(&root->fs_info->delalloc_lock); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->delalloc_lock); + + return 0; +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + u64 offset; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + offset = page_offset(page); + + spin_lock(&dirty_pages->buffer_lock); + eb = radix_tree_lookup( + &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, + offset >> PAGE_CACHE_SHIFT); + spin_unlock(&dirty_pages->buffer_lock); + if (eb) { + ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags); + atomic_set(&eb->refs, 1); + } + if (PageWriteback(page)) + 
end_page_writeback(page); + + lock_page(page); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&page->mapping->tree_lock); + } + + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = pinned_extents; + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + /* opt_discard */ + ret = btrfs_error_discard_extent(root, start, end + 1 - start); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + return 0; +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + LIST_HEAD(list); + + WARN_ON(1); + + mutex_lock(&root->fs_info->trans_mutex); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + list_splice_init(&root->fs_info->trans_list, &list); + while (!list_empty(&list)) { + t = list_entry(list.next, struct btrfs_transaction, list); + if (!t) + break; + + btrfs_destroy_ordered_operations(root); + + btrfs_destroy_ordered_extents(root); + + btrfs_destroy_delayed_refs(t, root); + + btrfs_block_rsv_release(root, + &root->fs_info->trans_block_rsv, + t->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + t->in_commit = 1; + t->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + t->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + t->commit_done = 1; + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + + btrfs_destroy_pending_snapshots(t); + + btrfs_destroy_delalloc_inodes(root); + + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + + btrfs_destroy_marked_extents(root, &t->dirty_pages, + EXTENT_DIRTY); + + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + t->use_count = 0; + list_del_init(&t->list); + memset(t, 0, sizeof(*t)); + kmem_cache_free(btrfs_transaction_cachep, t); + } + + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 88e825a0bf2..07b20dc2fd9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); +int btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, diff --git 
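
The export.c hunks that follow can drop their per-dentry d_set_d_op() calls because btrfs_fill_super() (later in this diff) now sets sb->s_d_op, and the VFS applies that default to every dentry it allocates, including the aliases d_obtain_alias() returns. A toy model of that inheritance, not the real VFS code:

    #include <stdio.h>

    struct dentry_operations { const char *name; };

    struct super_block {
        /* default operations for every new dentry on this sb */
        const struct dentry_operations *s_d_op;
    };

    struct dentry {
        const struct dentry_operations *d_op;
        const struct super_block *d_sb;
    };

    /* what d_alloc()/d_obtain_alias() now do implicitly */
    static void toy_d_alloc(struct dentry *d, const struct super_block *sb)
    {
        d->d_sb = sb;
        d->d_op = sb->s_d_op;
    }

    int main(void)
    {
        const struct dentry_operations btrfs_dentry_operations = { "btrfs" };
        const struct super_block sb = { &btrfs_dentry_operations };
        struct dentry d;

        toy_d_alloc(&d, &sb);
        printf("dentry uses %s ops\n", d.d_op->name);
        return 0;
    }
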
a/fs/btrfs/export.c b/fs/btrfs/export.c index 0ccf9a8afcd..9786963b07e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; struct btrfs_root *root; - struct dentry *dentry; struct inode *inode; struct btrfs_key key; int index; @@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); } - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) - d_set_d_op(dentry, &btrfs_dentry_operations); - return dentry; + return d_obtain_alias(inode); fail: srcu_read_unlock(&fs_info->subvol_srcu, index); return ERR_PTR(err); @@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; - struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -223,10 +218,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); - if (!IS_ERR(dentry)) - d_set_d_op(dentry, &btrfs_dentry_operations); - return dentry; + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); fail: btrfs_free_path(path); return ERR_PTR(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 227e5815d83..b55269340ce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) return btrfs_reduce_alloc_profile(root, flags); } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; @@ -3161,8 +3161,12 @@ alloc: bytes + 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); - if (ret < 0) - return ret; + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } if (!data_sinfo) { btrfs_set_inode_space_info(root, inode); @@ -3173,6 +3177,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ +commit_trans: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); @@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return -ENOSPC; } @@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache) if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; sinfo->bytes_reserved += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } + spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); return ret; @@ -8012,6 +8013,62 @@ out: return ret; } +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. 
+ */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + + return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + int i; + u64 free_bytes = 0; + + spin_lock(&sinfo->lock); + + for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); + + spin_unlock(&sinfo->lock); + + return free_bytes; +} + int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 min_free = btrfs_block_group_used(&block_group->item); - u64 dev_offset, max_avail; + u64 dev_offset; /* * check to make sure we can actually find a chunk with enough @@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free) { ret = find_free_dev_extent(NULL, device, min_free, - &dev_offset, &max_avail); + &dev_offset, NULL); if (!ret) break; ret = -1; @@ -8584,3 +8641,14 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes); +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e86b9f3650..2e993cf1766 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { this_bio_flag = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); @@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; eb->start = start; eb->len = len; spin_lock_init(&eb->lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4183c8178f0..7083cfafd06 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,8 +20,12 @@ #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) -/* flags for bio submission */ +/* + * flags for bio submission. 
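
The helpers added just below pack the compression type into the bits above EXTENT_BIO_FLAG_SHIFT while the low bits keep flags such as EXTENT_BIO_COMPRESSED. A standalone round-trip check of that packing; the BTRFS_COMPRESS_* values are assumed to be the 0/1/2 enumeration this series uses:

    #include <assert.h>
    #include <stdio.h>

    #define EXTENT_BIO_COMPRESSED 1
    #define EXTENT_BIO_FLAG_SHIFT 16

    enum { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2 };

    static void extent_set_compress_type(unsigned long *bio_flags, int compress_type)
    {
        *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
    }

    static int extent_compress_type(unsigned long bio_flags)
    {
        return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
    }

    int main(void)
    {
        unsigned long bio_flags = EXTENT_BIO_COMPRESSED;

        extent_set_compress_type(&bio_flags, BTRFS_COMPRESS_LZO);
        assert(bio_flags & EXTENT_BIO_COMPRESSED); /* low bits intact */
        assert(extent_compress_type(bio_flags) == BTRFS_COMPRESS_LZO);
        printf("bio_flags=%#lx type=%d\n", bio_flags,
               extent_compress_type(bio_flags));
        return 0;
    }
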
The high bits indicate the compression + * type for this bio + */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 @@ -135,6 +139,17 @@ struct extent_buffer { wait_queue_head_t lock_wq; }; +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + struct extent_map_tree; static inline struct extent_state *extent_state_next(struct extent_state *state) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23cb8da3ff6..b0e1fce1253 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> +#include "ctree.h" #include "extent_map.h" @@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask) return em; em->in_tree = 0; em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; atomic_set(&em->refs, 1); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ab6d74b6e64..28b44dbd1e3 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,7 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - int in_tree; + unsigned int in_tree:1; + unsigned int compress_type:4; }; struct extent_map_tree { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d85763..c800d58f301 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> @@ -224,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); free_extent_map(split); @@ -238,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->len = em->start + em->len - (start + len); split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; if (compressed) { split->block_len = em->block_len; @@ -890,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err) goto out; + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. 
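
The btrfs_fallocate() implementation in the file.c hunk below rejects every mode bit except FALLOC_FL_KEEP_SIZE. From userspace the new file_operations entry is reached through plain fallocate(2); a small demo against any writable file (allocated blocks grow while i_size stays put):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        struct stat st;
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0)
            return 1;

        /* preallocate 1 MiB past EOF without changing the file size */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) != 0)
            perror("fallocate");

        fstat(fd, &st);
        printf("size=%lld blocks=%lld\n",
               (long long)st.st_size, (long long)st.st_blocks);
        close(fd);
        return 0;
    }
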
+ */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + err = -EROFS; + goto out; + } + file_update_time(file); BTRFS_I(inode)->sequence++; @@ -1237,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1373,7 @@ const struct file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0ff46a4789..160b55b3e13 100644 --- 
a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; size_t datasize; unsigned long offset; - int use_compress = 0; + int compress_type = BTRFS_COMPRESS_NONE; if (compressed_size && compressed_pages) { - use_compress = 1; + compress_type = root->fs_info->compress_type; cur_size = compressed_size; } @@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); - if (use_compress) { + if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { @@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, - BTRFS_COMPRESS_ZLIB); + compress_type); } else { page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); @@ -263,6 +263,7 @@ struct async_extent { u64 compressed_size; struct page **pages; unsigned long nr_pages; + int compress_type; struct list_head list; }; @@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, + int compress_type) { struct async_extent *async_extent; @@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow, async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode, unsigned long max_uncompressed = 128 * 1024; int i; int will_compress; + int compress_type = root->fs_info->compress_type; actual_end = min_t(u64, isize, end + 1); again: @@ -381,12 +385,16 @@ again: WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - ret = btrfs_zlib_compress_pages(inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, - &total_in, - &total_compressed, - max_compressed); + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); if (!ret) { unsigned long offset = total_compressed & @@ -493,7 +501,8 @@ again: * and will submit them to the elevator. 
*/ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret); + total_compressed, pages, nr_pages_ret, + compress_type); if (start + num_bytes < end) { start += num_bytes; @@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; } @@ -640,6 +650,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -656,11 +667,13 @@ retry: async_extent->ram_size - 1, 0); } - ret = btrfs_add_ordered_extent(inode, async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - BTRFS_ORDERED_COMPRESSED); + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); BUG_ON(ret); /* @@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - int compressed = 0; + int compress_type = 0; int ret; bool nolock = false; @@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) - compressed = 1; + compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - BUG_ON(compressed); + BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + @@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - compressed, 0, 0, + compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, @@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); } failrec->logical = logical; free_extent_map(em); @@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; int err; + if (btrfs_root_readonly(root)) + return -EROFS; + err = inode_change_ok(inode, attr); if (err) return err; @@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) int index; int ret; - d_set_d_op(dentry, &btrfs_dentry_operations); - if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -4930,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path, size_t max_size; unsigned long inline_size; unsigned long ptr; + int compress_type; WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); max_size = 
btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(leaf, path->slots[0])); @@ -4941,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); - ret = btrfs_zlib_decompress(tmp, page, extent_offset, - inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); if (ret) { char *kaddr = kmap_atomic(page, KM_USER0); unsigned long copy_size = min_t(u64, @@ -4984,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compressed; + int compress_type; again: read_lock(&em_tree->lock); @@ -5043,7 +5062,7 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compressed = btrfs_file_extent_compression(leaf, item); + compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -5089,8 +5108,9 @@ again: em->block_start = EXTENT_MAP_HOLE; goto insert; } - if (compressed) { + if (compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; em->block_start = bytenr; em->block_len = btrfs_file_extent_disk_num_bytes(leaf, item); @@ -5124,12 +5144,14 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_start = EXTENT_MAP_INLINE; - if (compressed) + if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) == - BTRFS_COMPRESS_ZLIB) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); @@ -6479,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; - ei->force_compress = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree, GFP_NOFS); @@ -7100,112 +7122,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. 
- */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); @@ -7213,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) + return -EROFS; if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, flags, btrfs_check_acl); @@ -7308,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f87552a1d7e..a506a22b522 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int flags, oldflags; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -360,7 +363,8 @@ fail: } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid) + char *name, int namelen, u64 
*async_transid, + bool readonly) { struct inode *inode; struct dentry *parent; @@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; pending_snapshot->root = root; + pending_snapshot->readonly = readonly; trans = btrfs_start_transaction(root->fs_info->extent_root, 5); if (IS_ERR(trans)) { @@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid) + u64 *async_transid, bool readonly) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent, if (snap_src) { error = create_snapshot(snap_src, dentry, - name, namelen, async_transid); + name, namelen, async_transid, readonly); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, name, namelen, async_transid); @@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct page *page; + struct btrfs_super_block *disk_super; unsigned long last_index; unsigned long ra_pages = root->fs_info->bdi.ra_pages; unsigned long total_read = 0; + u64 features; u64 page_start; u64 page_end; u64 last_len = 0; @@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file, u64 defrag_end = 0; unsigned long i; int ret; + int compress_type = BTRFS_COMPRESS_ZLIB; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } if (inode->i_size == 0) return 0; @@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file, total_read++; mutex_lock(&inode->i_mutex); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = 1; + BTRFS_I(inode)->force_compress = compress_type; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) @@ -781,10 +796,17 @@ loop_unlock: atomic_dec(&root->fs_info->async_submit_draining); mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = 0; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; mutex_unlock(&inode->i_mutex); } + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + } + return 0; err_reservations: @@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, - u64 *transid) + u64 *transid, + bool readonly) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct file *src_file; @@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid); + NULL, transid, readonly); } else { struct inode *src_inode; src_file = fget(fd); @@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid); + transid, readonly); fput(src_file); } out: @@ -946,58 +969,139 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, 
int subvol, - int v2) + void __user *arg, int subvol) { - struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL; - char *name; - u64 fd; + struct btrfs_ioctl_vol_args *vol_args; int ret; - if (v2) { - u64 transid = 0; - u64 *ptr = NULL; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); - if (IS_ERR(vol_args_v2)) - return PTR_ERR(vol_args_v2); + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false); - if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out; - } - - name = vol_args_v2->name; - fd = vol_args_v2->fd; - vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + kfree(vol_args); + return ret; +} - if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) - ptr = &transid; +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, ptr); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) - ret = -EFAULT; - } else { - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - name = vol_args->name; - fd = vol_args->fd; - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, NULL); + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + ptr, readonly); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; out: kfree(vol_args); - kfree(vol_args_v2); + return ret; +} +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) + return -EINVAL; + + if (flags & ~BTRFS_SUBVOL_RDONLY) + return -EOPNOTSUPP; + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do 
*/ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + else + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out: + up_write(&root->fs_info->subvol_sem); return ret; } @@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file) if (file->private_data) goto out; + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + ret = mnt_want_write(file->f_path.mnt); if (ret) goto out; @@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0, 0); + return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create(file, argp, 0, 1); + return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1, 0); + return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index c344d12c646..8fb382167b1 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args { }; #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { @@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args { */ __u32 extent_thresh; + /* + * which compression method to use if turning on compression + * for this defrag operation. 
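
With compress_type carved out of the spare words (the field declaration is completed just below), userspace can ask for recompression with a specific algorithm while defragmenting. A hedged sketch of the call; the leading start/len/flags fields, the ioctl number 16, and the flag and type values are not part of this hunk and are reproduced from the btrfs headers of this era, so treat them as assumptions:

    #include <fcntl.h>
    #include <linux/ioctl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #define BTRFS_IOCTL_MAGIC           0x94
    #define BTRFS_DEFRAG_RANGE_COMPRESS 1   /* range->flags bit */
    #define BTRFS_COMPRESS_LZO          2   /* from compression.h */

    struct btrfs_ioctl_defrag_range_args {
        uint64_t start;         /* assumed leading fields */
        uint64_t len;
        uint64_t flags;
        uint32_t extent_thresh;
        uint32_t compress_type; /* new in this series */
        uint32_t unused[4];
    };

    #define BTRFS_IOC_DEFRAG_RANGE \
        _IOW(BTRFS_IOCTL_MAGIC, 16, struct btrfs_ioctl_defrag_range_args)

    int main(int argc, char **argv)
    {
        struct btrfs_ioctl_defrag_range_args args = {
            .len = (uint64_t)-1,                  /* whole file */
            .flags = BTRFS_DEFRAG_RANGE_COMPRESS,
            .compress_type = BTRFS_COMPRESS_LZO,
        };
        int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);

        if (fd < 0 || ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args) < 0)
            perror("defrag");
        close(fd);
        return 0;
    }
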
If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ - __u32 unused[5]; + __u32 unused[4]; }; struct btrfs_ioctl_space_info { @@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 00000000000..cc9b450399d --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,420 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/lzo.h> +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where decompressed data goes */ + void *cbuf; /* where compressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long 
tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + ret = -1; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. + */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) + goto out; + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long 
bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + ret = -1; + data_in = NULL; + goto done; + } + data_in = kmap(pages_in[page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed\n"); + ret = -1; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + if (data_in) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed!\n"); + ret = -1; + goto out; + } + + if (out_len < start_byte) { + ret = -1; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr, KM_USER0); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ae7737e352c..2b61e1ddcd9 
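
lzo.c frames its output independently of the LZO algorithm itself: a 4-byte little-endian total length, then one (4-byte chunk length, chunk data) pair per compressed page, skipping to the next page whenever fewer than LZO_LEN bytes remain. The framing round-trips as shown below; explicit byte shifts replace cpu_to_le32()/le32_to_cpu() so the sketch is endian-safe, and the chunk payloads are left out since only the lengths matter here:

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    #define LZO_LEN 4

    static void write_compress_length(unsigned char *buf, size_t len)
    {
        buf[0] = len & 0xff;
        buf[1] = (len >> 8) & 0xff;
        buf[2] = (len >> 16) & 0xff;
        buf[3] = (len >> 24) & 0xff;
    }

    static size_t read_compress_length(const unsigned char *buf)
    {
        return buf[0] | (buf[1] << 8) |
               ((size_t)buf[2] << 16) | ((size_t)buf[3] << 24);
    }

    int main(void)
    {
        unsigned char stream[64] = { 0 };
        size_t chunk1 = 11, chunk2 = 7;
        size_t off = LZO_LEN;   /* slot 0 holds the total, filled last */
        size_t tot_len, tot_in;

        write_compress_length(stream + off, chunk1);
        off += LZO_LEN + chunk1;
        write_compress_length(stream + off, chunk2);
        off += LZO_LEN + chunk2;
        write_compress_length(stream, off);     /* total stream length */

        /* walk it back the way lzo_decompress_biovec() does */
        tot_len = read_compress_length(stream);
        tot_in = LZO_LEN;
        while (tot_in < tot_len) {
            size_t in_len = read_compress_length(stream + tot_in);

            printf("chunk of %zu compressed bytes\n", in_len);
            tot_in += LZO_LEN + in_len;
        }
        assert(tot_in == tot_len);
        return 0;
    }
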
100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio) + int type, int dio, int compress_type) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = inode; + entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0); + disk_len, type, 0, + BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1); + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 61dca83119d..ff1f69aa188 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -68,7 +68,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ @@ -93,6 +93,9 @@ struct btrfs_ordered_extent { /* flags (described above) */ unsigned long flags; + /* compression algorithm */ + int compress_type; + /* reference count */ atomic_t refs; @@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 883c6fa1367..b2130c46fdb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,90 @@ static const struct super_operations btrfs_super_ops; +static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + default: + if (nbuf) { + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +static void __save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. 
Long term we'll + * also send it down to the disk + */ + fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +} + +/* NOTE: + * We move the write_super stuff to umount in order to avoid a deadlock, + * since umount holds all the locks. + */ +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + __save_error_info(fs_info); +} + +/* btrfs handles errors by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + sb->s_flags |= MS_RDONLY; + printk(KERN_INFO "btrfs is forced readonly\n"); + } +} + +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the appropriate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno) +{ + struct super_block *sb = fs_info->sb; + char nbuf[16]; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(fs_info); + + btrfs_handle_error(fs_info); +} +
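
__btrfs_std_error() takes the reporting function and line as explicit arguments; the btrfs_std_error() wrapper macro that supplies them lives in ctree.h and is not part of this hunk, but the usual shape of that pattern, in a runnable userspace form, looks like this:

    #include <stdio.h>

    static void __std_error(const char *function, unsigned int line, int errno_)
    {
        fprintf(stderr, "error in %s:%u: %d, forcing readonly\n",
                function, line, errno_);
        /* save_error_info() and btrfs_handle_error() would run here */
    }

    /* call sites report themselves without repeating __func__/__LINE__ */
    #define std_error(errno_) __std_error(__func__, __LINE__, (errno_))

    static int read_super(void)
    {
        int err = -5;   /* pretend the superblock read failed with -EIO */

        if (err) {
            std_error(err);
            return err;
        }
        return 0;
    }

    int main(void)
    {
        return read_super() ? 1 : 0;
    }
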
COMPRESS); + if (compress_force) { + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + pr_info("btrfs: force %s compression\n", + compress_type); + } else + pr_info("btrfs: use %s compression\n", + compress_type); break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); @@ -460,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; @@ -752,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; } +/* + * Helper to calculate the free space on the devices that can be used to store + * file data. + */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->rw_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc the min stripe number for data space allocation */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + min_stripes = 4; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!device->in_fs_metadata) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwriting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs cannot use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total.
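Read in isolation, the accounting in btrfs_calc_avail_data_space() is simple interval arithmetic. A minimal userspace-style sketch of the per-device step, under assumed names (usable_data_bytes and used_in_skip_range are illustrative; the kernel gets the latter from btrfs_account_dev_extents_size()):

#include <stdint.h>

#define STRIPE_LEN (64 * 1024)

/* total/used are byte counts for one device; skip_space is the reserved
 * head of the device (1MB, or alloc_start if larger) */
static uint64_t usable_data_bytes(uint64_t total, uint64_t used,
                                  uint64_t skip_space,
                                  uint64_t used_in_skip_range)
{
        uint64_t avail = total - used;

        avail -= avail % STRIPE_LEN;            /* align with stripe_len */
        skip_space -= used_in_skip_range;       /* only the free part counts */
        return avail >= skip_space ? avail - skip_space : 0;
}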
+ */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * min_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - min_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); @@ -759,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 total_used_data = 0; + u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; + int ret; + /* hold the chunk_mutex to avoid allocating new chunks */ + mutex_lock(&root->fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | - BTRFS_BLOCK_GROUP_SYSTEM)) - total_used_data += found->disk_total; - else - total_used_data += found->disk_used; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + } + total_used += found->disk_used; } rcu_read_unlock(); @@ -777,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(root, &total_free_data); + if (ret) { + mutex_unlock(&root->fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&root->fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -896,10 +1136,14 @@ static int __init init_btrfs_fs(void) if (err) return err; - err = btrfs_init_cachep(); + err = btrfs_init_compress(); if (err) goto free_sysfs; + err = btrfs_init_cachep(); + if (err) + goto free_compress; + err = extent_io_init(); if (err) goto free_cachep; @@ -927,6 +1171,8 @@ free_extent_io: extent_io_exit(); free_cachep: btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); free_sysfs: btrfs_exit_sysfs(); return err; @@ -941,7 +1187,7 @@ static void __exit exit_btrfs_fs(void) unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); - btrfs_zlib_exit(); + btrfs_exit_compress(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f50e931fc21..bae5c7b8bbe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; int ret; + + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return ERR_PTR(-EROFS); again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 to_reserve = 0; u64 index = 0; u64 objectid; + u64 root_flags; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { @@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); btrfs_set_lock_blocking(old); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f104b57ad4e..229a594cacd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -62,6 +62,7 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reservation for relocation */ int error; + bool readonly; struct list_head list; }; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b988450783..d158530233b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -493,7 +494,7 @@ again: continue; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; fs_devices->open_devices--; } @@ -527,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); fs_devices->open_devices--; } if (device->writeable) { @@ -584,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; + flags |= FMODE_EXCL; + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) continue; - bdev = open_bdev_exclusive(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name, flags, holder); if (IS_ERR(bdev)) { printk(KERN_INFO "open %s failed\n", device->name); goto error; @@ -598,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) + if (!bh) { + ret = -EINVAL; goto error_close; + } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -642,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, flags); error: continue; } @@ -688,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); - bdev = open_bdev_exclusive(path, flags, holder); + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); @@ -700,7 +706,7 @@ int
btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error_close; bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -720,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); return ret; } +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handle + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space that we find, or the size of the max + * free space if we don't find suitable free space + * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number * of extents + * + * @start is used to store the start of the free space if we find it. But if + * we don't find suitable free space, it will be used to store the start + * position of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space.
*/ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail) + u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; - u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; - int slot = 0; - int start_found; + int slot; struct extent_buffer *l; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - start_found = 0; - /* FIXME use last free of some kind */ /* we don't want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = 1024 * 1024; - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + if (root->fs_info->alloc_start + num_bytes <= search_end) search_start = max(root->fs_info->alloc_start, search_start); + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) - goto error; + goto out; if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) - goto error; - if (ret > 0) - start_found = 1; + goto out; } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -787,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, if (ret == 0) continue; if (ret < 0) - goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ? - last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + goto out; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -812,48 +911,62 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; - if (hole_size > *max_avail) - *max_avail = hole_size; + if (key.offset > search_start) { + hole_size = key.offset - search_start; - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than what we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller.
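The rewritten loop that follows is easier to see without the btree plumbing. A self-contained sketch of the same max-hole search, assuming the allocated extents come sorted by start offset (the btree walk above guarantees this); struct extent and find_hole are illustrative names:

#include <stdint.h>
#include <stddef.h>

struct extent { uint64_t start, len; };

/* returns 0 once a hole >= need is found (early exit, as the kernel code
 * does); otherwise -1, with *hole_start/*max_hole reporting the biggest
 * hole seen, mirroring @start/@len above */
static int find_hole(const struct extent *ext, size_t n,
                     uint64_t search_start, uint64_t search_end,
                     uint64_t need,
                     uint64_t *hole_start, uint64_t *max_hole)
{
        uint64_t pos = search_start;
        size_t i;

        *hole_start = search_start;
        *max_hole = 0;
        for (i = 0; i < n; i++) {
                if (ext[i].start > pos) {
                        uint64_t hole = ext[i].start - pos;

                        if (hole > *max_hole) {
                                *hole_start = pos;
                                *max_hole = hole;
                        }
                        if (hole >= need)
                                return 0;
                }
                if (ext[i].start + ext[i].len > pos)
                        pos = ext[i].start + ext[i].len;
        }
        if (search_end > pos && search_end - pos > *max_hole) {
                *hole_start = pos;              /* trailing hole */
                *max_hole = search_end - pos;
        }
        return *max_hole >= need ? 0 : -1;
}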
+ */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; + } } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; next: path->slots[0]++; cond_resched(); } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation - */ - BUG_ON(*start < search_start); - if (*start + num_bytes > search_end) { - ret = -ENOSPC; - goto error; + hole_size = search_end - search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; } - /* check for pending inserts here */ - ret = 0; -error: + /* See above. */ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: btrfs_free_path(path); +error: + *start = max_hole_start; + if (len) + *len = max_hole_size; return ret; } @@ -1183,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = open_bdev_exclusive(device_path, FMODE_READ, - root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -1193,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -1251,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1294,7 +1407,7 @@ error_brelse: brelse(bh); error_close: if (bdev) - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); @@ -1446,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -1572,7 +1686,7 @@ out: mutex_unlock(&root->fs_info->volume_mutex); return ret; error: - close_bdev_exclusive(bdev, 0); + blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -1912,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root) if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; @@ -2150,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size * num_stripes; } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes, u64 *stripe_size, - u64 start, u64 type) +/* Used to sort the devices by max_avail (descending sort) */ +int btrfs_cmp_device_free_bytes(const void
*dev_info1, const void *dev_info2) { - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct list_head private_devs; - int min_stripe_size = 1 * 1024 * 1024; - u64 calc_size = 1024 * 1024 * 1024; - u64 max_chunk_size = calc_size; - u64 min_free; - u64 avail; - u64 max_avail = 0; - u64 dev_offset; - int num_stripes = 1; - int min_stripes = 1; - int sub_stripes = 0; - int looped = 0; - int ret; - int index; - int stripe_len = 64 * 1024; + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } - if (list_empty(&fs_devices->alloc_list)) - return -ENOSPC; +static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type, + int *num_stripes, int *min_stripes, + int *sub_stripes) +{ + *num_stripes = 1; + *min_stripes = 1; + *sub_stripes = 0; if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - num_stripes = fs_devices->rw_devices; - min_stripes = 2; + *num_stripes = fs_devices->rw_devices; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID1)) { if (fs_devices->rw_devices < 2) return -ENOSPC; - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 4) + *num_stripes = fs_devices->rw_devices; + if (*num_stripes < 4) return -ENOSPC; - num_stripes &= ~(u32)1; - sub_stripes = 2; - min_stripes = 4; + *num_stripes &= ~(u32)1; + *sub_stripes = 2; + *min_stripes = 4; } + return 0; +} + +static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, + u64 proposed_size, u64 type, + int num_stripes, int small_stripe) +{ + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = proposed_size; + u64 max_chunk_size = calc_size; + int ncopies = 1; + + if (type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) + ncopies = 2; + if (type & BTRFS_BLOCK_GROUP_DATA) { max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; @@ -2226,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); -again: - max_avail = 0; - if (!map || map->num_stripes != num_stripes) { - kfree(map); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) - return -ENOMEM; - map->num_stripes = num_stripes; - } - - if (calc_size * num_stripes > max_chunk_size) { - calc_size = max_chunk_size; + if (calc_size * num_stripes > max_chunk_size * ncopies) { + calc_size = max_chunk_size * ncopies; do_div(calc_size, num_stripes); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; } /* we don't want tiny stripes */ - if (!looped) + if (!small_stripe) calc_size = max_t(u64, min_stripe_size, calc_size); /* - * we're about to do_div by the stripe_len so lets make sure + * we're about to do_div by 
BTRFS_STRIPE_LEN, so let's make sure * we end up with something bigger than a stripe */ - calc_size = max_t(u64, calc_size, stripe_len * 4); + calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); + + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; + + return calc_size; +} + +static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, + int num_stripes) +{ + struct map_lookup *new; + size_t len = map_lookup_size(num_stripes); + + BUG_ON(map->num_stripes < num_stripes); + + if (map->num_stripes == num_stripes) + return map; + + new = kmalloc(len, GFP_NOFS); + if (!new) { + /* just change map->num_stripes */ + map->num_stripes = num_stripes; + return map; + } + + memcpy(new, map, len); + new->num_stripes = num_stripes; + kfree(map); + return new; +} + +/* + * helper to allocate device space from btrfs_device_info, in which we store + * the max free space information of every device. It is used when we cannot + * allocate chunks of the default size. + * + * With this helper, we can allocate a new chunk as large as possible. + */ +static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_devices *fs_devices, + struct btrfs_device_info *devices, + int nr_device, u64 type, + struct map_lookup **map_lookup, + int min_stripes, u64 *stripe_size) +{ + int i, index, sort_again = 0; + int min_devices = min_stripes; + u64 max_avail, min_free; + struct map_lookup *map = *map_lookup; + int ret; + + if (nr_device < min_stripes) + return -ENOSPC; + + btrfs_descending_sort_devices(devices, nr_device); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + max_avail = devices[0].max_avail; + if (!max_avail) + return -ENOSPC; + + for (i = 0; i < nr_device; i++) { + /* + * if dev_offset == 0, it means the free space of this device + * is less than what we need, and we didn't search for the max + * avail extent on this device, so do it now.
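__btrfs_alloc_tiny_space() depends on the descending order produced by btrfs_descending_sort_devices(). A userspace equivalent with qsort(), shown only to make the comparator's ordering explicit (struct dev_info and cmp_desc are illustrative names):

#include <stdint.h>
#include <stdlib.h>

struct dev_info { uint64_t max_avail; };

/* larger max_avail sorts first, like btrfs_cmp_device_free_bytes() */
static int cmp_desc(const void *a, const void *b)
{
        const struct dev_info *da = a, *db = b;

        if (da->max_avail > db->max_avail)
                return -1;
        if (da->max_avail < db->max_avail)
                return 1;
        return 0;
}

static void sort_devices(struct dev_info *devs, size_t n)
{
        qsort(devs, n, sizeof(*devs), cmp_desc);
        /* after this, devs[min_devices - 1].max_avail is the largest
         * stripe size that min_devices devices can all provide */
}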
+ */ + if (!devices[i].dev_offset) { + ret = find_free_dev_extent(trans, devices[i].dev, + max_avail, + &devices[i].dev_offset, + &devices[i].max_avail); + if (ret != 0 && ret != -ENOSPC) + return ret; + sort_again = 1; + } + } + + /* we updated the max avail free extent of each device, so sort again */ + if (sort_again) + btrfs_descending_sort_devices(devices, nr_device); + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_devices = 1; + + if (!devices[min_devices - 1].max_avail) + return -ENOSPC; + + max_avail = devices[min_devices - 1].max_avail; + if (type & BTRFS_BLOCK_GROUP_DUP) + do_div(max_avail, 2); + + max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, + min_stripes, 1); + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = max_avail * 2; + else + min_free = max_avail; + + if (min_free > devices[min_devices - 1].max_avail) + return -ENOSPC; + + map = __shrink_map_lookup_stripes(map, min_stripes); + *stripe_size = max_avail; + + index = 0; + for (i = 0; i < min_stripes; i++) { + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset; + if (type & BTRFS_BLOCK_GROUP_DUP) { + i++; + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset + + max_avail; + } + index++; + } + *map_lookup = map; + + return 0; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info; + struct list_head private_devs; + u64 calc_size = 1024 * 1024 * 1024; + u64 min_free; + u64 avail; + u64 dev_offset; + int num_stripes; + int min_stripes; + int sub_stripes; + int min_devices; /* the min number of devices we need */ + int i; + int ret; + int index; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, + &min_stripes, &sub_stripes); + if (ret) + return ret; + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; cur = fs_devices->alloc_list.next; index = 0; + i = 0; - if (type & BTRFS_BLOCK_GROUP_DUP) + calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, + num_stripes, 0); + + if (type & BTRFS_BLOCK_GROUP_DUP) { min_free = calc_size * 2; - else + min_devices = 1; + } else { min_free = calc_size; - - /* - * we add 1MB because we never use the first 1MB of the device, unless - * we've looped, then we are likely allocating the maximum amount of - * space left already - */ - if (!looped) - min_free += 1024 * 1024; + min_devices = min_stripes; + } INIT_LIST_HEAD(&private_devs); while (index < num_stripes) { @@ -2283,27 +2559,39 @@ again: cur = cur->next; if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); + ret = find_free_dev_extent(trans, device, min_free, + &devices_info[i].dev_offset, +
&devices_info[i].max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); map->stripes[index].dev = device; - map->stripes[index].physical = dev_offset; + map->stripes[index].physical = + devices_info[i].dev_offset; index++; if (type & BTRFS_BLOCK_GROUP_DUP) { map->stripes[index].dev = device; map->stripes[index].physical = - dev_offset + calc_size; + devices_info[i].dev_offset + + calc_size; index++; } - } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; + } else if (ret != -ENOSPC) + goto error; + + devices_info[i].dev = device; + i++; + } else if (device->in_fs_metadata && + avail >= BTRFS_STRIPE_LEN) { + devices_info[i].dev = device; + devices_info[i].max_avail = avail; + i++; + } + if (cur == &fs_devices->alloc_list) break; } + list_splice(&private_devs, &fs_devices->alloc_list); if (index < num_stripes) { if (index >= min_stripes) { @@ -2312,34 +2600,36 @@ again: num_stripes /= sub_stripes; num_stripes *= sub_stripes; } - looped = 1; - goto again; - } - if (!looped && max_avail > 0) { - looped = 1; - calc_size = max_avail; - goto again; + + map = __shrink_map_lookup_stripes(map, num_stripes); + } else if (i >= min_devices) { + ret = __btrfs_alloc_tiny_space(trans, fs_devices, + devices_info, i, type, + &map, min_stripes, + &calc_size); + if (ret) + goto error; + } else { + ret = -ENOSPC; + goto error; } - kfree(map); - return -ENOSPC; } map->sector_size = extent_root->sectorsize; - map->stripe_len = stripe_len; - map->io_align = stripe_len; - map->io_width = stripe_len; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->num_stripes = num_stripes; map->sub_stripes = sub_stripes; *map_ret = map; *stripe_size = calc_size; *num_bytes = chunk_bytes_by_type(type, calc_size, - num_stripes, sub_stripes); + map->num_stripes, sub_stripes); em = alloc_extent_map(GFP_NOFS); if (!em) { - kfree(map); - return -ENOMEM; + ret = -ENOMEM; + goto error; } em->bdev = (struct block_device *)map; em->start = start; @@ -2372,7 +2662,13 @@ again: index++; } + kfree(devices_info); return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; } static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2740db49eb0..7fb59d45fe8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,8 +20,11 @@ #define __BTRFS_VOLUMES_ #include <linux/bio.h> +#include <linux/sort.h> #include "async-thread.h" +#define BTRFS_STRIPE_LEN (64 * 1024) + struct buffer_head; struct btrfs_pending_bios { struct bio *head; @@ -50,7 +53,7 @@ struct btrfs_device { struct block_device *bdev; - /* the mode sent to open_bdev_exclusive */ + /* the mode sent to blkdev_get */ fmode_t mode; char *name; @@ -136,6 +139,30 @@ struct btrfs_multi_bio { struct btrfs_bio_stripe stripes[]; }; +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; +}; + +/* Used to sort the devices by max_avail (descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); + +/* + * sort the devices by max_avail, in which the max free extent size of each + * device is stored (descending sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length); + #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 698fdd2c739..a5776531dc2 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, int btrfs_removexattr(struct dentry *dentry, const char *name) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b9cd5445f71..f5ec2d44150 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -32,15 +32,6 @@ #include <linux/bio.h> #include "compression.h" -/* Plan: call deflate() with avail_in == *sourcelen, - avail_out = *dstlen - 12 and flush == Z_FINISH. - If it doesn't manage to finish, call it again with - avail_in == 0 and avail_out set to the remaining 12 - bytes for it to clean up. - Q: Is 12 bytes sufficient? -*/ -#define STREAM_END_SPACE 12 - struct workspace { z_stream inf_strm; z_stream def_strm; @@ -48,152 +39,51 @@ struct workspace { struct list_head list; }; -static LIST_HEAD(idle_workspace); -static DEFINE_SPINLOCK(workspace_lock); -static unsigned long num_workspace; -static atomic_t alloc_workspace = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); +static void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); -/* - * this finds an available zlib workspace or allocates a new one - * NULL or an ERR_PTR is returned if things go bad. 
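The replacement workspace helpers below keep the ERR_PTR convention the old comment mentions. For readers new to it, a hedged sketch of a caller (example_get_workspace is hypothetical; IS_ERR()/PTR_ERR() are the real <linux/err.h> accessors):

#include <linux/err.h>

static int example_get_workspace(struct list_head **wsp)
{
        struct list_head *ws = zlib_alloc_workspace();

        if (IS_ERR(ws))                 /* errno encoded in the pointer */
                return PTR_ERR(ws);     /* here always -ENOMEM */
        *wsp = ws;
        return 0;
}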
- */ -static struct workspace *find_zlib_workspace(void) + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; - int ret; - int cpus = num_online_cpus(); - -again: - spin_lock(&workspace_lock); - if (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - num_workspace--; - spin_unlock(&workspace_lock); - return workspace; - } - spin_unlock(&workspace_lock); - if (atomic_read(&alloc_workspace) > cpus) { - DEFINE_WAIT(wait); - prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(&alloc_workspace) > cpus) - schedule(); - finish_wait(&workspace_wait, &wait); - goto again; - } - atomic_inc(&alloc_workspace); workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) { - ret = -ENOMEM; - goto fail; - } + if (!workspace) + return ERR_PTR(-ENOMEM); workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); - if (!workspace->def_strm.workspace) { - ret = -ENOMEM; - goto fail; - } workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); - if (!workspace->inf_strm.workspace) { - ret = -ENOMEM; - goto fail_inflate; - } workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->buf) { - ret = -ENOMEM; - goto fail_kmalloc; - } - return workspace; - -fail_kmalloc: - vfree(workspace->inf_strm.workspace); -fail_inflate: - vfree(workspace->def_strm.workspace); -fail: - kfree(workspace); - atomic_dec(&alloc_workspace); - wake_up(&workspace_wait); - return ERR_PTR(ret); -} - -/* - * put a workspace struct back on the list or free it if we have enough - * idle ones sitting around - */ -static int free_workspace(struct workspace *workspace) -{ - spin_lock(&workspace_lock); - if (num_workspace < num_online_cpus()) { - list_add_tail(&workspace->list, &idle_workspace); - num_workspace++; - spin_unlock(&workspace_lock); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; - } - spin_unlock(&workspace_lock); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; - atomic_dec(&alloc_workspace); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; -} + INIT_LIST_HEAD(&workspace->list); -/* - * cleanup function for module exit - */ -static void free_workspaces(void) -{ - struct workspace *workspace; - while (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); - atomic_dec(&alloc_workspace); - } + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); } -/* - * given an address space and start/len, compress the bytes. - * - * pages are allocated to hold the compressed result and stored - * in 'pages' - * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. 
It - * may be smaller then len if we had to exit early because we - * ran out of room in the pages array or because we cross the - * max_out threshold. - * - * total_out is used to return the total number of compressed bytes - * - * max_out tells us the max number of bytes that we're allowed to - * stuff into pages - */ -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - struct workspace *workspace; char *data_in; char *cpage_out; int nr_pages = 0; @@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_out = 0; *total_in = 0; - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -1; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "deflateInit failed\n"); ret = -1; @@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; @@ -314,55 +208,26 @@ out: kunmap(in_page); page_cache_release(in_page); } - free_workspace(workspace); return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. 
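For context on the z_stream bookkeeping these functions maintain, here is a self-contained one-shot inflate using ordinary userspace zlib; the kernel's zlib_inflateInit2()/zlib_inflate() follow the same model, just with caller-provided workspaces instead of malloc:

#include <string.h>
#include <zlib.h>

/* decompress in[0..inlen) into out[0..outlen) in one Z_FINISH call;
 * returns 0 on success, -1 on error or truncated output */
static int inflate_buf(unsigned char *in, size_t inlen,
                       unsigned char *out, size_t outlen)
{
        z_stream strm;
        int ret;

        memset(&strm, 0, sizeof(strm));
        if (inflateInit2(&strm, MAX_WBITS) != Z_OK)
                return -1;
        strm.next_in = in;
        strm.avail_in = (uInt)inlen;
        strm.next_out = out;
        strm.avail_out = (uInt)outlen;
        ret = inflate(&strm, Z_FINISH);
        inflateEnd(&strm);
        return ret == Z_STREAM_END ? 0 : -1;
}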
- */ -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen) +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) { - int ret = 0; + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; int wbits = MAX_WBITS; - struct workspace *workspace; char *data_in; size_t total_out = 0; - unsigned long page_bytes_left; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - struct page *page_out; unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - unsigned long working_bytes; unsigned long pg_offset; - unsigned long start_byte; - unsigned long current_buf_start; - char *kaddr; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; data_in = kmap(pages_in[page_in_index]); workspace->inf_strm.next_in = data_in; @@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, workspace->inf_strm.total_out = 0; workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - page_out = bvec[page_out_index].bv_page; - page_bytes_left = PAGE_CACHE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; - /* - * buf start is the byte offset we're of the start of - * our workspace buffer - */ - buf_start = total_out; - /* total_out is the last byte of the workspace buffer */ + buf_start = total_out; total_out = workspace->inf_strm.total_out; - working_bytes = total_out - buf_start; - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. 
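The copying that this change moves into btrfs_decompress_buf2page() boils down to interval overlap between the decompressed range and each destination page. A standalone sketch of that core (copy_overlap is an illustrative name, not the kernel helper):

#include <stdint.h>
#include <string.h>

/* copy the overlap of decompressed bytes [buf_start, buf_end) with the
 * page covering [page_start, page_start + page_len); returns the number
 * of bytes copied */
static size_t copy_overlap(char *page, uint64_t page_start, size_t page_len,
                           const char *buf, uint64_t buf_start,
                           uint64_t buf_end)
{
        uint64_t start = buf_start > page_start ? buf_start : page_start;
        uint64_t end = buf_end < page_start + page_len ?
                       buf_end : page_start + page_len;

        if (start >= end)
                return 0;
        memcpy(page + (start - page_start), buf + (start - buf_start),
               end - start);
        return (size_t)(end - start);
}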
- */ - start_byte = page_offset(page_out) - disk_start; - - if (working_bytes == 0) { - /* we didn't make progress in this inflate - * call, we're done - */ - if (ret != Z_STREAM_END) - ret = -1; + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) break; - } - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - goto next; - - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; - } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); - bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); - memcpy(kaddr + pg_offset, workspace->buf + buf_offset, - bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page_out); - - pg_offset += bytes; - page_bytes_left -= bytes; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; - - /* check if we need to pick another page */ - if (page_bytes_left == 0) { - page_out_index++; - if (page_out_index >= vcnt) { - ret = 0; - goto done; - } - - page_out = bvec[page_out_index].bv_page; - pg_offset = 0; - page_bytes_left = PAGE_CACHE_SIZE; - start_byte = page_offset(page_out) - disk_start; - - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - goto next; - - /* the next page in the biovec might not - * be adjacent to the last page, but it - * might still be found inside this working - * buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + - buf_offset; - } - } + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; } -next: + workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; @@ -516,35 +301,21 @@ done: zlib_inflateEnd(&workspace->inf_strm); if (data_in) kunmap(pages_in[page_in_index]); -out: - free_workspace(workspace); return ret; } -/* - * a less complex decompression routine. Our compressed data fits in a - * single page, and we want to read a single page out of it. 
- * start_byte tells us the offset into the compressed data we're interested in - */ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - struct workspace *workspace; unsigned long bytes_left = destlen; unsigned long total_out = 0; char *kaddr; - if (destlen > PAGE_CACHE_SIZE) - return -ENOMEM; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; - workspace->inf_strm.next_in = data_in; workspace->inf_strm.avail_in = srclen; workspace->inf_strm.total_in = 0; @@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (bytes_left > 0) { @@ -616,12 +386,13 @@ next: ret = 0; zlib_inflateEnd(&workspace->inf_strm); -out: - free_workspace(workspace); return ret; } -void btrfs_zlib_exit(void) -{ - free_workspaces(); -} +struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 9e6c4f2e8ff..bd352125e82 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -2,31 +2,10 @@ # Makefile for CEPH filesystem. # -ifneq ($(KERNELRELEASE),) - obj-$(CONFIG_CEPH_FS) += ceph.o -ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. - -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install - -clean: - $(MAKE) -C $(KERNELDIR) M=$(PWD) clean - -endif diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 7ae1b3d55b5..08f65faac11 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p) for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { req = rb_entry(rp, struct ceph_mds_request, r_node); - if (req->r_request) - seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); - else + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index fa7ca04ee81..0bc68de8edd 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1224,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn) } } +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. 
+ */ +unsigned ceph_dentry_hash(struct dentry *dn) +{ + struct inode *dir = dn->d_parent->d_inode; + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, .readdir = ceph_readdir, diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 2297d942699..e41056174bf 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = parent->d_name.hash; + cfh->parent_name_hash = ceph_dentry_hash(parent); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e61de4f7b99..e835eff551e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_release_count = 0; ci->i_symlink = NULL; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -689,6 +691,8 @@ static int fill_inode(struct inode *inode, inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; + ci->i_dir_layout = iinfo->dir_layout; + ci->i_files = le64_to_cpu(info->files); ci->i_subdirs = le64_to_cpu(info->subdirs); ci->i_rbytes = le64_to_cpu(info->rbytes); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a50fca1e03b..1e30d194a8e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -60,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops; * parse individual inode info */ static int parse_reply_info_in(void **p, void *end, - struct ceph_mds_reply_info_in *info) + struct ceph_mds_reply_info_in *info, + int features) { int err = -EIO; @@ -74,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end, info->symlink = *p; *p += info->symlink_len; + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + else + memset(&info->dir_layout, 0, sizeof(info->dir_layout)); + ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; @@ -88,12 +95,13 @@ bad: * target inode. 
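The features parameter threaded through these parsers enables the standard wire-compatibility pattern: only consume the new field when the peer advertised the matching feature bit, otherwise zero it. A standalone sketch under assumed names (FEAT_DIRLAYOUT and decode_dir_layout are stand-ins for CEPH_FEATURE_DIRLAYOUTHASH and the ceph_decode_*_safe() helpers):

#include <stdint.h>
#include <string.h>

#define FEAT_DIRLAYOUT (1 << 5)         /* illustrative bit value */

/* returns bytes consumed, or -1 if the buffer is too short */
static int decode_dir_layout(const uint8_t *p, size_t avail, int features,
                             uint8_t layout[8])
{
        if (!(features & FEAT_DIRLAYOUT)) {
                memset(layout, 0, 8);   /* old peer: field not on the wire */
                return 0;
        }
        if (avail < 8)
                return -1;
        memcpy(layout, p, 8);
        return 8;
}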
*/ static int parse_reply_info_trace(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { int err; if (info->head->is_dentry) { - err = parse_reply_info_in(p, end, &info->diri); + err = parse_reply_info_in(p, end, &info->diri, features); if (err < 0) goto out_bad; @@ -114,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end, } if (info->head->is_target) { - err = parse_reply_info_in(p, end, &info->targeti); + err = parse_reply_info_in(p, end, &info->targeti, features); if (err < 0) goto out_bad; } @@ -134,7 +142,8 @@ out_bad: * parse readdir results */ static int parse_reply_info_dir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { u32 num, i = 0; int err; @@ -182,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end, *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i]); + err = parse_reply_info_in(p, end, &info->dir_in[i], features); if (err < 0) goto out_bad; i++; @@ -205,7 +214,8 @@ out_bad: * parse fcntl F_GETLK results */ static int parse_reply_info_filelock(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -225,19 +235,21 @@ bad: * parse extra results */ static int parse_reply_info_extra(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) - return parse_reply_info_filelock(p, end, info); + return parse_reply_info_filelock(p, end, info, features); else - return parse_reply_info_dir(p, end, info); + return parse_reply_info_dir(p, end, info, features); } /* * parse entire mds reply */ static int parse_reply_info(struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { void *p, *end; u32 len; @@ -250,7 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_trace(&p, p+len, info); + err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; } @@ -258,7 +270,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_extra(&p, p+len, info); + err = parse_reply_info_extra(&p, p+len, info, features); if (err < 0) goto out_bad; } @@ -654,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else { /* dir + name */ inode = dir; - hash = req->r_dentry->d_name.hash; + hash = ceph_dentry_hash(req->r_dentry); is_hash = true; } } @@ -1693,7 +1705,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int flags = 0; - req->r_mds = mds; req->r_attempts++; if (req->r_inode) { struct ceph_cap *cap = @@ -1780,6 +1791,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } + put_request_session(req); + mds = __choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { @@ -1797,6 +1810,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } } + req->r_session = get_session(session); + dout("do_request mds%d session %p state %s\n", mds, session, session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && 
@@ -1809,7 +1824,6 @@ static int __do_request(struct ceph_mds_client *mdsc, } /* send request */ - req->r_session = get_session(session); req->r_resend_mds = -1; /* forget any previous mds hint */ if (req->r_request_started == 0) /* note request start time */ @@ -1863,7 +1877,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); - put_request_session(req); __do_request(mdsc, req); } } @@ -2056,8 +2069,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) goto out; } else { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = - ceph_get_cap_for_mds(ci, req->r_mds);; + struct ceph_cap *cap = NULL; + + if (req->r_session) + cap = ceph_get_cap_for_mds(ci, + req->r_session->s_mds); dout("already using auth"); if ((!cap || cap != ci->i_auth_cap) || @@ -2101,7 +2117,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo); + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index aabe563b54d..4e3a9cc0bba 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -35,6 +35,7 @@ struct ceph_cap; */ struct ceph_mds_reply_info_in { struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; u32 symlink_len; char *symlink; u32 xattr_len; @@ -165,7 +166,6 @@ struct ceph_mds_request { struct ceph_mds_client *r_mdsc; int r_op; /* mds op code */ - int r_mds; /* operation on what? */ struct inode *r_inode; /* arg1 */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 08b460ae053..bf6f0f34082 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -428,7 +428,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features |= CEPH_FEATURE_FLOCK; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; @@ -443,13 +444,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail_client; err = -ENOMEM; - fsc->wb_wq = create_workqueue("ceph-writeback"); + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. 
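The conversion just below swaps create_workqueue()/create_singlethread_workqueue() for alloc_workqueue(name, flags, max_active); passing max_active = 1 bounds concurrency as the comment asks without dedicating a kernel thread per queue. A minimal sketch of the same usage ("example-wq" and my_work_fn are illustrative):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *wq;

static void my_work_fn(struct work_struct *work)
{
        /* work body; at most one instance runs at a time */
}
static DECLARE_WORK(my_work, my_work_fn);

static int example_setup(void)
{
        wq = alloc_workqueue("example-wq", 0, 1);
        if (!wq)
                return -ENOMEM;
        queue_work(wq, &my_work);
        return 0;
}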
+ */ + fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); if (fsc->wb_wq == NULL) goto fail_bdi; - fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); + fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; - fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); + fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); if (fsc->trunc_wq == NULL) goto fail_pg_inv_wq; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 4553d8829ed..20b907d76ae 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -239,6 +239,7 @@ struct ceph_inode_info { unsigned i_ceph_flags; unsigned long i_release_count; + struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; char *i_symlink; @@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct dentry *dn); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/char_dev.c b/fs/char_dev.c index e5b9df993b9..dca9e5e0f73 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -59,7 +59,7 @@ static struct char_device_struct { } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } @@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp) return ret; } -int cdev_index(struct inode *inode) -{ - int idx; - struct kobject *kobj; - - kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); - if (!kobj) - return -1; - kobject_put(kobj); - return idx; -} - void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); @@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); -EXPORT_SYMBOL(cdev_index); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 224d7bbd1fc..e654dfd092c 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, void *buffer, uint16_t maxbuf) { const struct TCP_Server_Info *server = cookie_netfs_data; - const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; + const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; + const struct sockaddr_in *addr = (struct sockaddr_in *) sa; + const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; struct cifs_server_key *key = buffer; uint16_t key_len = sizeof(struct cifs_server_key); @@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, */ switch (sa->sa_family) { case AF_INET: - key->family = server->addr.sockAddr.sin_family; - key->port = server->addr.sockAddr.sin_port; - key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; + key->family = sa->sa_family; + key->port = addr->sin_port; + key->addr[0].ipv4_addr = addr->sin_addr; key_len += sizeof(key->addr[0].ipv4_addr); break; case AF_INET6: - key->family = server->addr.sockAddr6.sin6_family; - key->port = server->addr.sockAddr6.sin6_port; - key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; + key->family = sa->sa_family; + key->port = addr6->sin6_port; + key->addr[0].ipv6_addr = addr6->sin6_addr; key_len += sizeof(key->addr[0].ipv6_addr); break; diff --git 
a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 103ab8b605b..65829d32128 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", + cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d", mid_entry->midState, (int)mid_entry->command, mid_entry->pid, - mid_entry->tsk, + mid_entry->callback_data, mid_entry->mid); #ifdef CONFIG_CIFS_STATS2 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", @@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); - seq_printf(m, "Features: "); + seq_printf(m, "Features:"); #ifdef CONFIG_CIFS_DFS_UPCALL - seq_printf(m, "dfs"); - seq_putc(m, ' '); + seq_printf(m, " dfs"); #endif #ifdef CONFIG_CIFS_FSCACHE - seq_printf(m, "fscache"); - seq_putc(m, ' '); + seq_printf(m, " fscache"); #endif #ifdef CONFIG_CIFS_WEAK_PW_HASH - seq_printf(m, "lanman"); - seq_putc(m, ' '); + seq_printf(m, " lanman"); #endif #ifdef CONFIG_CIFS_POSIX - seq_printf(m, "posix"); - seq_putc(m, ' '); + seq_printf(m, " posix"); #endif #ifdef CONFIG_CIFS_UPCALL - seq_printf(m, "spnego"); - seq_putc(m, ' '); + seq_printf(m, " spnego"); #endif #ifdef CONFIG_CIFS_XATTR - seq_printf(m, "xattr"); + seq_printf(m, " xattr"); +#endif +#ifdef CONFIG_CIFS_ACL + seq_printf(m, " acl"); #endif seq_putc(m, '\n'); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); @@ -220,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) mid_entry = list_entry(tmp3, struct mid_q_entry, qhead); seq_printf(m, "\tState: %d com: %d pid:" - " %d tsk: %p mid %d\n", + " %d cbdata: %p mid %d\n", mid_entry->midState, (int)mid_entry->command, mid_entry->pid, - mid_entry->tsk, + mid_entry->callback_data, mid_entry->mid); } spin_unlock(&GlobalMid_Lock); @@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) atomic_read(&totSmBufAllocCount)); #endif /* CONFIG_CIFS_STATS2 */ - seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); + seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount)); seq_printf(m, "\n%d session %d share reconnects\n", tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index c68a056f27f..7ed36536e75 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, } -static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, - struct list_head *mntlist) -{ - /* stolen from afs code */ - int err; - - mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist); - switch (err) { - case 0: - path_put(&nd->path); - nd->path.mnt = newmnt; - nd->path.dentry = dget(newmnt->mnt_root); - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - default: - mntput(newmnt); - break; - } - return err; -} - static void dump_referral(const struct dfs_info3_param *ref) { 
cFYI(1, "DFS: ref path: %s", ref->path_name); @@ -293,45 +264,43 @@ static void dump_referral(const struct dfs_info3_param *ref) ref->path_consumed); } - -static void* -cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +/* + * Create a vfsmount that we can automount + */ +static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) { struct dfs_info3_param *referrals = NULL; unsigned int num_referrals = 0; struct cifs_sb_info *cifs_sb; struct cifsSesInfo *ses; - char *full_path = NULL; + char *full_path; int xid, i; - int rc = 0; - struct vfsmount *mnt = ERR_PTR(-ENOENT); + int rc; + struct vfsmount *mnt; struct tcon_link *tlink; cFYI(1, "in %s", __func__); - BUG_ON(IS_ROOT(dentry)); + BUG_ON(IS_ROOT(mntpt)); xid = GetXid(); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - /* * The MSDFS spec states that paths in DFS referral requests and * responses must be prefixed by a single '\' character instead of * the double backslashes usually used in the UNC. This function * gives us the latter, so we must adjust the result. */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; - goto out_err; - } + mnt = ERR_PTR(-ENOMEM); + full_path = build_path_from_dentry(mntpt); + if (full_path == NULL) + goto free_xid; - cifs_sb = CIFS_SB(dentry->d_inode->i_sb); + cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); + mnt = ERR_PTR(-EINVAL); if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - goto out_err; + mnt = ERR_CAST(tlink); + goto free_full_path; } ses = tlink_tcon(tlink)->ses; @@ -341,46 +310,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) cifs_put_tlink(tlink); + mnt = ERR_PTR(-ENOENT); for (i = 0; i < num_referrals; i++) { int len; - dump_referral(referrals+i); + dump_referral(referrals + i); /* connect to a node */ len = strlen(referrals[i].node_name); if (len < 2) { cERROR(1, "%s: Net Address path too short: %s", __func__, referrals[i].node_name); - rc = -EINVAL; - goto out_err; + mnt = ERR_PTR(-EINVAL); + break; } mnt = cifs_dfs_do_refmount(cifs_sb, full_path, referrals + i); cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, referrals[i].node_name, mnt); - - /* complete mount procedure if we accured submount */ if (!IS_ERR(mnt)) - break; + goto success; } - /* we need it cause for() above could exit without valid submount */ - rc = PTR_ERR(mnt); - if (IS_ERR(mnt)) - goto out_err; - - rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); + /* no valid submounts were found; return error from get_dfs_path() by + * preference */ + if (rc != 0) + mnt = ERR_PTR(rc); -out: - FreeXid(xid); +success: free_dfs_info_array(referrals, num_referrals); +free_full_path: kfree(full_path); +free_xid: + FreeXid(xid); cFYI(1, "leaving %s" , __func__); - return ERR_PTR(rc); -out_err: - path_put(&nd->path); - goto out; + return mnt; +} + +/* + * Attempt to automount the referral + */ +struct vfsmount *cifs_dfs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + cFYI(1, "in %s", __func__); + + newmnt = cifs_dfs_do_automount(path->dentry); + if (IS_ERR(newmnt)) { + cFYI(1, "leaving %s [automount failed]" , __func__); + return newmnt; + } + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cFYI(1, "leaving %s [ok]" , __func__); + return newmnt; } const struct inode_operations cifs_dfs_referral_inode_operations = { - .follow_link = 
cifs_dfs_follow_mountpoint, }; - diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7852cd67705..ac51cd2d33a 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -40,6 +40,7 @@ #define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ #define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ +#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 87044906cd1..4dfba828316 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -98,6 +98,8 @@ struct key * cifs_get_spnego_key(struct cifsSesInfo *sesInfo) { struct TCP_Server_Info *server = sesInfo->server; + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; char *description, *dp; size_t desc_len; struct key *spnego_key; @@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); /* add the server address */ - if (server->addr.sockAddr.sin_family == AF_INET) - sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); - else if (server->addr.sockAddr.sin_family == AF_INET6) - sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); + if (server->dstaddr.ss_family == AF_INET) + sprintf(dp, "ip4=%pI4", &sa->sin_addr); + else if (server->dstaddr.ss_family == AF_INET6) + sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); else goto out; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 430f510a172..fc0fd4fde30 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, int charlen, outlen = 0; int maxwords = maxbytes / 2; char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; - for (i = 0; i < maxwords && from[i]; i++) { - charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, - NLS_MAX_CHARSET_SIZE); + for (i = 0; i < maxwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); if (charlen > 0) outlen += charlen; else @@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, } /* - * cifs_mapchar - convert a little-endian char to proper char in codepage + * cifs_mapchar - convert a host-endian char to proper char in codepage * @target - where converted character should be copied - * @src_char - 2 byte little-endian source character + * @src_char - 2 byte host-endian source character * @cp - codepage to which character should be converted * @mapchar - should character be mapped according to mapchars mount option? * @@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). */ static int -cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, bool mapchar) { int len = 1; @@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, * build_path_from_dentry are modified, as they use slash as * separator. 
*/ - switch (le16_to_cpu(src_char)) { + switch (src_char) { case UNI_COLON: *target = ':'; break; @@ -109,8 +113,7 @@ out: return len; cp_convert: - len = cp->uni2char(le16_to_cpu(src_char), target, - NLS_MAX_CHARSET_SIZE); + len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); if (len <= 0) { *target = '?'; len = 1; @@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; /* * because the chars can be of varying widths, we need to take care @@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, */ safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); - for (i = 0; i < fromwords && from[i]; i++) { + for (i = 0; i < fromwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + /* * check to see if converting this character might make the * conversion bleed into the null terminator */ if (outlen >= safelen) { - charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); + charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); if ((outlen + charlen) > (tolen - nullsize)) break; } /* put converted char into 'to' buffer */ - charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); + charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); outlen += charlen; } @@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len, { int charlen; int i; - wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ + wchar_t wchar_to; /* needed to quiet sparse */ for (i = 0; len && *from; i++, from += charlen, len -= charlen) { - - /* works for 2.4.0 kernel or later */ - charlen = codepage->char2uni(from, len, &wchar_to[i]); + charlen = codepage->char2uni(from, len, &wchar_to); if (charlen < 1) { - cERROR(1, "strtoUCS: char2uni of %d returned %d", - (int)*from, charlen); + cERROR(1, "strtoUCS: char2uni of 0x%x returned %d", + *from, charlen); /* A question mark */ - to[i] = cpu_to_le16(0x003f); + wchar_to = 0x003f; charlen = 1; - } else - to[i] = cpu_to_le16(wchar_to[i]); - + } + put_unaligned_le16(wchar_to, &to[i]); } - to[i] = 0; + put_unaligned_le16(0, &to[i]); return i; } @@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, return dst; } +/* + * Convert 16 bit Unicode pathname to wire format from string in current code + * page. Conversion may involve remapping up the six characters that are + * only legal in POSIX-like OS (if they are present in the string). Path + * names are little endian 16 bit Unicode on the wire + */ +int +cifsConvertToUCS(__le16 *target, const char *source, int maxlen, + const struct nls_table *cp, int mapChars) +{ + int i, j, charlen; + int len_remaining = maxlen; + char src_char; + __u16 temp; + + if (!mapChars) + return cifs_strtoUCS(target, source, PATH_MAX, cp); + + for (i = 0, j = 0; i < maxlen; j++) { + src_char = source[i]; + switch (src_char) { + case 0: + put_unaligned_le16(0, &target[j]); + goto ctoUCS_out; + case ':': + temp = UNI_COLON; + break; + case '*': + temp = UNI_ASTERIK; + break; + case '?': + temp = UNI_QUESTION; + break; + case '<': + temp = UNI_LESSTHAN; + break; + case '>': + temp = UNI_GRTRTHAN; + break; + case '|': + temp = UNI_PIPE; + break; + /* + * FIXME: We can not handle remapping backslash (UNI_SLASH) + * until all the calls to build_path_from_dentry are modified, + * as they use backslash as separator. 
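+ * The UNI_* constants used above are defined in cifs_unicode.h; they
+ * appear to place each reserved character in the Unicode private-use area
+ * by adding 0xF000 to its ASCII value (':' -> U+F03A, '*' -> U+F02A, and
+ * so on), and cifs_mapchar() earlier in this file performs the inverse
+ * mapping on the way back in, so a POSIX name round-trips:
+ *
+ *	"a:b" --cifsConvertToUCS--> 'a', 0xF03A, 'b'  (UTF-16LE on the wire)
+ *	      --cifs_mapchar------> "a:b" again on readdir/lookup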
+ */ + default: + charlen = cp->char2uni(source+i, len_remaining, + &temp); + /* + * if no match, use question mark, which at least in + * some cases serves as wild card + */ + if (charlen < 1) { + temp = 0x003f; + charlen = 1; + } + len_remaining -= charlen; + /* + * character may take more than one byte in the source + * string, but will take exactly two bytes in the + * target string + */ + i += charlen; + continue; + } + put_unaligned_le16(temp, &target[j]); + i++; /* move to next char in source string */ + len_remaining--; + } + +ctoUCS_out: + return i; +} + diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index a437ec391a0..1e7636b145a 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -41,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { ; -/* security id for everyone */ +/* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; +/* security id for Authenticated Users system group */ +static const struct cifs_sid sid_authusers = { + 1, 1, {0, 0, 0, 0, 0, 5}, {11} }; /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; @@ -365,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, if (num_aces > 0) { umode_t user_mask = S_IRWXU; umode_t group_mask = S_IRWXG; - umode_t other_mask = S_IRWXO; + umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); @@ -390,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, ppace[i]->type, &fattr->cf_mode, &other_mask); + if (compare_sids(&(ppace[i]->sid), &sid_authusers)) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + /* memcpy((void *)(&(cifscred->aces[i])), (void *)ppace[i], diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index f856732161a..66f3d50d067 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -72,6 +72,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, return 0; } +/* must be called with server->srv_mutex held */ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -84,14 +85,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) return rc; - spin_lock(&GlobalMid_Lock); cifs_pdu->Signature.Sequence.SequenceNumber = cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - spin_unlock(&GlobalMid_Lock); rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); if (rc) @@ -149,6 +148,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, return rc; } +/* must be called with server->srv_mutex held */ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -162,14 +162,12 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) return rc; - spin_lock(&GlobalMid_Lock); cifs_pdu->Signature.Sequence.SequenceNumber = cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - spin_unlock(&GlobalMid_Lock); rc = cifs_calc_signature2(iov, n_vec, server, 
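/*
 * Why the GlobalMid_Lock acquisitions could be dropped from both signing
 * routines above: the client's sequence counter only stays in step with
 * the server's if "stamp the PDU with a sequence number" and "put it on
 * the wire" happen atomically, so the send path has to hold srv_mutex
 * across both anyway -- hence the "must be called with server->srv_mutex
 * held" contract. Note that the counter advances by two per call: the
 * request is stamped with the current value and the next one is reserved
 * for verifying the server's signed response. A caller is assumed to look
 * roughly like this (expected_seq is a stand-in for wherever the mid
 * stores it):
 *
 *	__u32 expected_seq;
 *
 *	mutex_lock(&server->srv_mutex);
 *	rc = cifs_sign_smb(out_buf, server, &expected_seq);
 *	if (rc == 0)
 *		rc = smb_send(server, out_buf, out_buf->smb_buf_length);
 *	mutex_unlock(&server->srv_mutex);
 */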
smb_signature); if (rc) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8e21e0fe65d..a8323f1dc1c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, int, 0); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 50 Range: 2 to 256"); - +unsigned short echo_retries = 5; +module_param(echo_retries, ushort, 0644); +MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " + "reconnecting server. Default: 5. 0 means " + "never reconnect."); extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -174,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data, goto out_no_root; } + /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */ + if (cifs_sb_master_tcon(cifs_sb)->nocase) + sb->s_d_op = &cifs_ci_dentry_ops; + else + sb->s_d_op = &cifs_dentry_ops; + #ifdef CONFIG_CIFS_EXPERIMENTAL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); @@ -329,6 +339,8 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->invalid_mapping = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; + cifs_inode->uniqueid = 0; + cifs_inode->createtime = 0; /* Can not set i_flags here - they get immediately overwritten to zero by the VFS */ @@ -361,18 +373,19 @@ cifs_evict_inode(struct inode *inode) static void cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) { + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + seq_printf(s, ",addr="); - switch (server->addr.sockAddr.sin_family) { + switch (server->dstaddr.ss_family) { case AF_INET: - seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr); + seq_printf(s, "%pI4", &sa->sin_addr.s_addr); break; case AF_INET6: - seq_printf(s, "%pI6", - &server->addr.sockAddr6.sin6_addr.s6_addr); - if (server->addr.sockAddr6.sin6_scope_id) - seq_printf(s, "%%%u", - server->addr.sockAddr6.sin6_scope_id); + seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr); + if (sa6->sin6_scope_id) + seq_printf(s, "%%%u", sa6->sin6_scope_id); break; default: seq_printf(s, "(unknown)"); @@ -720,6 +733,25 @@ const struct file_operations cifs_file_ops = { .setlease = cifs_setlease, }; +const struct file_operations cifs_file_strict_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_strict_readv, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + const struct file_operations cifs_file_direct_ops = { /* no aio, no readv - BB reevaluate whether they can be done with directio, no cache */ @@ -738,6 +770,7 @@ const struct file_operations cifs_file_direct_ops = { .llseek = cifs_llseek, .setlease = cifs_setlease, }; + const struct file_operations cifs_file_nobrl_ops = { .read = do_sync_read, .write = do_sync_write, @@ -756,6 +789,24 @@ const struct file_operations cifs_file_nobrl_ops = { .setlease = cifs_setlease, }; +const struct file_operations cifs_file_strict_nobrl_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = 
cifs_strict_readv, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + const struct file_operations cifs_file_direct_nobrl_ops = { /* no mmap, no aio, no readv - BB reevaluate whether they can be done with directio, no cache */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 897b2b2b28b..f23206d4653 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); extern int cifs_revalidate_file(struct file *filp); extern int cifs_revalidate_dentry(struct dentry *); +extern void cifs_invalidate_mapping(struct inode *inode); extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int cifs_setattr(struct dentry *, struct iattr *); @@ -72,19 +73,25 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations; /* Functions related to files and directories */ extern const struct file_operations cifs_file_ops; extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ -extern const struct file_operations cifs_file_nobrl_ops; -extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ +extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */ +extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */ +extern const struct file_operations cifs_file_direct_nobrl_ops; +extern const struct file_operations cifs_file_strict_nobrl_ops; extern int cifs_open(struct inode *inode, struct file *file); extern int cifs_close(struct inode *inode, struct file *file); extern int cifs_closedir(struct inode *inode, struct file *file); extern ssize_t cifs_user_read(struct file *file, char __user *read_data, - size_t read_size, loff_t *poffset); + size_t read_size, loff_t *poffset); +extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, int); +extern int cifs_strict_fsync(struct file *, int); extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); @@ -93,6 +100,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); extern const struct dentry_operations cifs_dentry_ops; extern const struct dentry_operations cifs_ci_dentry_ops; +#ifdef CONFIG_CIFS_DFS_UPCALL +extern struct vfsmount *cifs_dfs_d_automount(struct path *path); +#else +#define cifs_dfs_d_automount NULL +#endif + /* Functions related to symlinks */ extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); extern void cifs_put_link(struct dentry *direntry, @@ -112,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct 
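/*
 * The cifs_file_strict_*_ops tables above back the new "strict cache"
 * mode (presumably selected when CIFS_MOUNT_STRICT_IO from cifs_fs_sb.h
 * is set on the mount): the page cache is trusted only while the server
 * has granted an oplock. A rough sketch of the idea behind
 * cifs_strict_readv -- not its actual body, and cifs_do_uncached_read()
 * is a hypothetical stand-in for the real bypass path:
 *
 *	ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
 *				  unsigned long nr_segs, loff_t pos)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 *
 *		if (CIFS_I(inode)->clientCanCacheRead)
 *			return generic_file_aio_read(iocb, iov, nr_segs, pos);
 *		return cifs_do_uncached_read(iocb, iov, nr_segs, pos);
 *	}
 */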
export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.68" +#define CIFS_VERSION "1.69" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 7136c0c3e2f..5bfb75346cb 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -161,35 +161,24 @@ struct TCP_Server_Info { int srv_count; /* reference counter */ /* 15 character server name + 0x20 16th byte indicating type = srv */ char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + enum statusEnum tcpStatus; /* what we think the status is */ char *hostname; /* hostname portion of UNC string */ struct socket *ssocket; - union { - struct sockaddr_in sockAddr; - struct sockaddr_in6 sockAddr6; - } addr; + struct sockaddr_storage dstaddr; struct sockaddr_storage srcaddr; /* locally bind to this IP */ wait_queue_head_t response_q; wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ struct list_head pending_mid_q; - void *Server_NlsInfo; /* BB - placeholder for future NLS info */ - unsigned short server_codepage; /* codepage for the server */ - enum protocolEnum protocolType; - char versionMajor; - char versionMinor; - bool svlocal:1; /* local server or remote */ bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ bool tcp_nodelay; atomic_t inFlight; /* number of requests on the wire to server */ -#ifdef CONFIG_CIFS_STATS2 - atomic_t inSend; /* requests trying to send */ - atomic_t num_waiters; /* blocked waiting to get in sendrecv */ -#endif - enum statusEnum tcpStatus; /* what we think the status is */ struct mutex srv_mutex; struct task_struct *tsk; char server_GUID[16]; char secMode; + bool session_estab; /* mark when very first sess is established */ + u16 dialect; /* dialect index that server chose */ enum securityEnum secType; unsigned int maxReq; /* Clients should submit no more */ /* than maxReq distinct unanswered SMBs to the server when using */ @@ -202,28 +191,29 @@ struct TCP_Server_Info { unsigned int max_vcs; /* maximum number of smb sessions, at least those that can be specified uniquely with vcnumbers */ - char sessid[4]; /* unique token id for this session */ - /* (returned on Negotiate */ int capabilities; /* allow selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ __u16 CurrentMid; /* multiplex id - rotating counter */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; - __u32 sequence_number; /* needed for CIFS PDU signature */ + __u32 sequence_number; /* for signing, protected by srv_mutex */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ - u16 dialect; /* dialect index that server chose */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ /* extended security flavors that server supports */ + bool sec_ntlmssp; /* supports NTLMSSP */ + bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ - bool sec_kerberosu2u; /* supports U2U Kerberos */ - bool sec_ntlmssp; /* supports NTLMSSP */ - bool session_estab; /* mark when very first sess is established */ + struct delayed_work echo; /* echo ping workqueue job */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif +#ifdef CONFIG_CIFS_STATS2 + 
atomic_t inSend; /* requests trying to send */ + atomic_t num_waiters; /* blocked waiting to get in sendrecv */ +#endif }; /* @@ -449,13 +439,14 @@ struct cifsInodeInfo { /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ - unsigned long time; /* jiffies of last update/check of inode */ - bool clientCanCacheRead:1; /* read oplock */ - bool clientCanCacheAll:1; /* read and writebehind oplock */ - bool delete_pending:1; /* DELETE_ON_CLOSE is set */ - bool invalid_mapping:1; /* pagecache is invalid */ + bool clientCanCacheRead; /* read oplock */ + bool clientCanCacheAll; /* read and writebehind oplock */ + bool delete_pending; /* DELETE_ON_CLOSE is set */ + bool invalid_mapping; /* pagecache is invalid */ + unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server */ u64 uniqueid; /* server inode number */ + u64 createtime; /* creation time on server */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; #endif @@ -510,6 +501,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon, #endif +struct mid_q_entry; + +/* + * This is the prototype for the mid callback function. When creating one, + * take special care to avoid deadlocks. Things to bear in mind: + * + * - it will be called by cifsd + * - the GlobalMid_Lock will be held + * - the mid will be removed from the pending_mid_q list + */ +typedef void (mid_callback_t)(struct mid_q_entry *mid); + /* one of these for every pending CIFS request to the server */ struct mid_q_entry { struct list_head qhead; /* mids waiting on reply from this server */ @@ -521,7 +524,8 @@ struct mid_q_entry { unsigned long when_sent; /* time when smb send finished */ unsigned long when_received; /* when demux complete (taken off wire) */ #endif - struct task_struct *tsk; /* task waiting for response */ + mid_callback_t *callback; /* call completion callback */ + void *callback_data; /* general purpose pointer for callback */ struct smb_hdr *resp_buf; /* response buffer */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ @@ -576,6 +580,7 @@ struct cifs_fattr { u64 cf_uniqueid; u64 cf_eof; u64 cf_bytes; + u64 cf_createtime; uid_t cf_uid; gid_t cf_gid; umode_t cf_mode; @@ -623,12 +628,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define CIFS_IOVEC 4 /* array of response buffers */ /* Type of Request to SendReceive2 */ -#define CIFS_STD_OP 0 /* normal request timeout */ -#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ -#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ -#define CIFS_BLOCKING_OP 4 /* operation can block */ -#define CIFS_ASYNC_OP 8 /* do not wait for response */ -#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */ +#define CIFS_BLOCKING_OP 1 /* operation can block */ +#define CIFS_ASYNC_OP 2 /* do not wait for response */ +#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */ #define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ #define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ #define CIFS_NO_RESP 0x040 /* no response buffer required */ @@ -791,6 +793,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +/* 
reconnect after this many failed echo attempts */ +GLOBAL_EXTERN unsigned short echo_retries; + void cifs_oplock_break(struct work_struct *work); void cifs_oplock_break_get(struct cifsFileInfo *cfile); void cifs_oplock_break_put(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de36b09763a..b5c8cc5d7a7 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -23,6 +23,7 @@ #define _CIFSPDU_H #include <net/sock.h> +#include <asm/unaligned.h> #include "smbfsctl.h" #ifdef CONFIG_CIFS_WEAK_PW_HASH @@ -50,6 +51,7 @@ #define SMB_COM_SETATTR 0x09 /* trivial response */ #define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ #define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ +#define SMB_COM_ECHO 0x2B /* echo request */ #define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ #define SMB_COM_READ_ANDX 0x2E #define SMB_COM_WRITE_ANDX 0x2F @@ -425,11 +427,49 @@ struct smb_hdr { __u16 Mid; __u8 WordCount; } __attribute__((packed)); -/* given a pointer to an smb_hdr retrieve the value of byte count */ -#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) -#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) + +/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */ +#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \ + (2 * (smb_var)->WordCount)) + /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ -#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) +#define pByteArea(smb_var) (BCC(smb_var) + 2) + +/* get the converted ByteCount for a SMB packet and return it */ +static inline __u16 +get_bcc(struct smb_hdr *hdr) +{ + __u16 *bc_ptr = (__u16 *)BCC(hdr); + + return get_unaligned(bc_ptr); +} + +/* get the unconverted ByteCount for a SMB packet and return it */ +static inline __u16 +get_bcc_le(struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + return get_unaligned_le16(bc_ptr); +} + +/* set the ByteCount for a SMB packet in host-byte order */ +static inline void +put_bcc(__u16 count, struct smb_hdr *hdr) +{ + __u16 *bc_ptr = (__u16 *)BCC(hdr); + + put_unaligned(count, bc_ptr); +} + +/* set the ByteCount for a SMB packet in little-endian */ +static inline void +put_bcc_le(__u16 count, struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + put_unaligned_le16(count, bc_ptr); +} /* * Computer Name Length (since Netbios name was length 16 with last byte 0x20) @@ -760,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext { * */ +typedef struct smb_com_echo_req { + struct smb_hdr hdr; + __le16 EchoCount; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_REQ; + +typedef struct smb_com_echo_rsp { + struct smb_hdr hdr; + __le16 SequenceNumber; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_RSP; + typedef struct smb_com_logoff_andx_req { struct smb_hdr hdr; /* wct = 2 */ __u8 AndXCommand; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e6d1481b16c..982895fa761 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -61,6 +61,12 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, char **devname); /* extern void renew_parental_timestamps(struct dentry *direntry);*/ +extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, + struct TCP_Server_Info 
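/*
 * The get_bcc()/put_bcc() helpers added to cifspdu.h above exist because
 * the ByteCount field floats: it sits immediately after the
 * variable-length word block, at offset sizeof(struct smb_hdr) +
 * 2 * WordCount. The packed header works out to an odd size (37 bytes
 * counting the 4-byte length prefix at the front of the struct), so the
 * BCC lands at an odd offset for every possible WordCount, and
 * dereferencing a __u16 * there faults on strict-alignment architectures
 * -- hence get_unaligned()/put_unaligned() rather than plain loads and
 * stores:
 *
 *	__u16 bcc = get_bcc(&rsp->hdr);        native-endian read
 *	put_bcc_le(bcc + delta, &rsp->hdr);    little-endian store
 */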
*server); +extern void DeleteMidQEntry(struct mid_q_entry *midEntry); +extern int cifs_call_async(struct TCP_Server_Info *server, + struct smb_hdr *in_buf, mid_callback_t *callback, + void *cbdata); extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , @@ -347,12 +353,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 netfid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, - const bool waitFlag); + const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, const __u64 len, struct file_lock *, const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); +extern int CIFSSMBEcho(struct TCP_Server_Info *server); extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); extern struct cifsSesInfo *sesInfoAlloc(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 67acfb3acad..3106f5e5c63 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -331,37 +331,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon, static int validate_t2(struct smb_t2_rsp *pSMB) { - int rc = -EINVAL; - int total_size; - char *pBCC; + unsigned int total_size; + + /* check for plausible wct */ + if (pSMB->hdr.WordCount < 10) + goto vt2_err; - /* check for plausible wct, bcc and t2 data and parm sizes */ /* check for parm and data offset going beyond end of smb */ - if (pSMB->hdr.WordCount >= 10) { - if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && - (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { - /* check that bcc is at least as big as parms + data */ - /* check that bcc is less than negotiated smb buffer */ - total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); - if (total_size < 512) { - total_size += - le16_to_cpu(pSMB->t2_rsp.DataCount); - /* BCC le converted in SendReceive */ - pBCC = (pSMB->hdr.WordCount * 2) + - sizeof(struct smb_hdr) + - (char *)pSMB; - if ((total_size <= (*(u16 *)pBCC)) && - (total_size < - CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { - return 0; - } - } - } - } + if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 || + get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024) + goto vt2_err; + + /* check that bcc is at least as big as parms + data */ + /* check that bcc is less than negotiated smb buffer */ + total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount); + if (total_size >= 512) + goto vt2_err; + + total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount); + if (total_size > get_bcc(&pSMB->hdr) || + total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) + goto vt2_err; + + return 0; +vt2_err: cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, sizeof(struct smb_t2_rsp) + 16); - return rc; + return -EINVAL; } + int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) { @@ -401,15 +399,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { cFYI(1, "Kerberos only mechanism, enable extended security"); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - } -#ifdef CONFIG_CIFS_EXPERIMENTAL - else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { cFYI(1, "NTLMSSP 
only mechanism, enable extended security"); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } -#endif count = 0; for (i = 0; i < CIFS_NUM_PROT; i++) { @@ -455,7 +450,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); - GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { @@ -569,7 +563,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); - GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); server->capabilities = le32_to_cpu(pSMBr->Capabilities); server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; @@ -709,6 +702,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon) return rc; } +/* + * This is a no-op for now. We're not really interested in the reply, but + * rather in the fact that the server sent one and that server->lstrp + * gets updated. + * + * FIXME: maybe we should consider checking that the reply matches request? + */ +static void +cifs_echo_callback(struct mid_q_entry *mid) +{ + struct TCP_Server_Info *server = mid->callback_data; + + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +int +CIFSSMBEcho(struct TCP_Server_Info *server) +{ + ECHO_REQ *smb; + int rc = 0; + + cFYI(1, "In echo request"); + + rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); + if (rc) + return rc; + + /* set up echo request */ + smb->hdr.Tid = cpu_to_le16(0xffff); + smb->hdr.WordCount = 1; + put_unaligned_le16(1, &smb->EchoCount); + put_bcc_le(1, &smb->hdr); + smb->Data[0] = 'a'; + smb->hdr.smb_buf_length += 3; + + rc = cifs_call_async(server, (struct smb_hdr *)smb, + cifs_echo_callback, server); + if (rc) + cFYI(1, "Echo request failed: %d", rc); + + cifs_small_buf_release(smb); + + return rc; +} + int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) { @@ -1196,7 +1236,7 @@ OldOpenRetry: pSMB->ByteCount = cpu_to_le16(count); /* long_op set to 1 to allow for oplock break timeouts */ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); + (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->num_opens); if (rc) { cFYI(1, "Error in Open = %d", rc); @@ -1309,7 +1349,7 @@ openRetry: pSMB->ByteCount = cpu_to_le16(count); /* long_op set to 1 to allow for oplock break timeouts */ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); + (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->num_opens); if (rc) { cFYI(1, "Error in Open = %d", rc); @@ -1391,7 +1431,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid, iov[0].iov_base = (char *)pSMB; iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, - &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); + &resp_buf_type, CIFS_LOG_ERROR); cifs_stats_inc(&tcon->num_reads); pSMBr = (READ_RSP *)iov[0].iov_base; if (rc) { @@ -1666,7 +1706,8 @@ int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const __u64 len, const __u64 offset, const 
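/*
 * CIFSSMBEcho() above is the first user of the async machinery: instead
 * of SendReceive() blocking until the reply arrives, cifs_call_async()
 * queues a mid with a completion callback and returns once the request is
 * on the wire. The callback later runs in cifsd -- under GlobalMid_Lock
 * and with the mid already off pending_mid_q, per the contract in
 * cifsglob.h -- so it must not sleep and must tear the mid down itself,
 * which is everything cifs_echo_callback() does. A minimal caller of the
 * same pattern, with a hypothetical callback name:
 *
 *	static void my_noop_callback(struct mid_q_entry *mid)
 *	{
 *		struct TCP_Server_Info *server = mid->callback_data;
 *
 *		DeleteMidQEntry(mid);
 *		atomic_dec(&server->inFlight);
 *		wake_up(&server->request_q);
 *	}
 *
 *	rc = cifs_call_async(server, (struct smb_hdr *)smb,
 *			     my_noop_callback, server);
 */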
__u32 numUnlock, - const __u32 numLock, const __u8 lockType, const bool waitFlag) + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level) { int rc = 0; LOCK_REQ *pSMB = NULL; @@ -1694,6 +1735,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, pSMB->NumberOfLocks = cpu_to_le16(numLock); pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); pSMB->LockType = lockType; + pSMB->OplockLevel = oplock_level; pSMB->AndXCommand = 0xFF; /* none */ pSMB->Fid = smb_file_id; /* netfid stays le */ @@ -3090,7 +3132,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, - CIFS_STD_OP); + 0); cifs_stats_inc(&tcon->num_acl_get); if (rc) { cFYI(1, "Send error in QuerySecDesc = %d", rc); @@ -5565,7 +5607,7 @@ QAllEAsRetry: } /* make sure list_len doesn't go past end of SMB */ - end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); + end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr); if ((char *)ea_response_data + list_len > end_of_smb) { cFYI(1, "EA list appears to go beyond SMB"); rc = -EIO; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index cc1a8604a79..18d3c7724d6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -52,6 +52,9 @@ #define CIFS_PORT 445 #define RFC1001_PORT 139 +/* SMB echo "timeout" -- FIXME: tunable? */ +#define SMB_ECHO_INTERVAL (60 * HZ) + extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); @@ -64,8 +67,8 @@ struct smb_vol { char *UNC; char *UNCip; char *iocharset; /* local code page for mapping to and from Unicode */ - char source_rfc1001_name[16]; /* netbios name of client */ - char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ + char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ + char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; @@ -115,8 +118,8 @@ struct smb_vol { #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) -static int ipv4_connect(struct TCP_Server_Info *server); -static int ipv6_connect(struct TCP_Server_Info *server); +static int ip_connect(struct TCP_Server_Info *server); +static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); @@ -152,6 +155,7 @@ cifs_reconnect(struct TCP_Server_Info *server) /* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they are not used until reconnected */ + cFYI(1, "%s: marking sessions and tcons for reconnect", __func__); spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); @@ -163,7 +167,9 @@ cifs_reconnect(struct TCP_Server_Info *server) } } spin_unlock(&cifs_tcp_ses_lock); + /* do not want to be sending data on a socket we are freeing */ + cFYI(1, "%s: tearing down socket", __func__); mutex_lock(&server->srv_mutex); if (server->ssocket) { cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, @@ -180,30 +186,27 @@ cifs_reconnect(struct TCP_Server_Info *server) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; + server->lstrp = jiffies; + mutex_unlock(&server->srv_mutex); + /* mark submitted MIDs for retry and issue callback */ + cFYI(1, "%s: issuing 
mid callbacks", __func__); spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct - mid_q_entry, - qhead); - if (mid_entry->midState == MID_REQUEST_SUBMITTED) { - /* Mark other intransit requests as needing - retry so we do not immediately mark the - session bad again (ie after we reconnect - below) as they timeout too */ + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + if (mid_entry->midState == MID_REQUEST_SUBMITTED) mid_entry->midState = MID_RETRY_NEEDED; - } + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); } spin_unlock(&GlobalMid_Lock); - mutex_unlock(&server->srv_mutex); while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood)) { try_to_freeze(); - if (server->addr.sockAddr6.sin6_family == AF_INET6) - rc = ipv6_connect(server); - else - rc = ipv4_connect(server); + + /* we should try only the port we connected to before */ + rc = generic_ip_connect(server); if (rc) { cFYI(1, "reconnect error %d", rc); msleep(3000); @@ -213,10 +216,9 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsGood; spin_unlock(&GlobalMid_Lock); - /* atomic_set(&server->inFlight,0);*/ - wake_up(&server->response_q); } } + return rc; } @@ -230,9 +232,8 @@ cifs_reconnect(struct TCP_Server_Info *server) static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) { struct smb_t2_rsp *pSMBt; - int total_data_size; - int data_in_this_rsp; int remaining; + __u16 total_data_size, data_in_this_rsp; if (pSMB->Command != SMB_COM_TRANSACTION2) return 0; @@ -246,8 +247,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) pSMBt = (struct smb_t2_rsp *)pSMB; - total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); - data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount); + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); remaining = total_data_size - data_in_this_rsp; @@ -273,21 +274,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) { struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; - int total_data_size; - int total_in_buf; - int remaining; - int total_in_buf2; char *data_area_of_target; char *data_area_of_buf2; - __u16 byte_count; + int remaining; + __u16 byte_count, total_data_size, total_in_buf, total_in_buf2; - total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); - if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { + if (total_data_size != + get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount)) cFYI(1, "total data size of primary and secondary t2 differ"); - } - total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); + total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); remaining = total_data_size - total_in_buf; @@ -297,28 +295,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) if (remaining == 0) /* nothing to do, ignore */ return 0; - total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); + total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount); if (remaining < total_in_buf2) { cFYI(1, "transact2 2nd response contains too much data"); } /* find end of first SMB data area */ data_area_of_target = (char *)&pSMBt->hdr.Protocol + - 
le16_to_cpu(pSMBt->t2_rsp.DataOffset); + get_unaligned_le16(&pSMBt->t2_rsp.DataOffset); /* validate target area */ - data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol + - le16_to_cpu(pSMB2->t2_rsp.DataOffset); + data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol + + get_unaligned_le16(&pSMB2->t2_rsp.DataOffset); data_area_of_target += total_in_buf; /* copy second buffer into end of first buffer */ memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); total_in_buf += total_in_buf2; - pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf); - byte_count = le16_to_cpu(BCC_LE(pTargetSMB)); + put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); + byte_count = get_bcc_le(pTargetSMB); byte_count += total_in_buf2; - BCC_LE(pTargetSMB) = cpu_to_le16(byte_count); + put_bcc_le(byte_count, pTargetSMB); byte_count = pTargetSMB->smb_buf_length; byte_count += total_in_buf2; @@ -332,7 +330,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) return 0; /* we are done */ } else /* more responses to go */ return 1; +} +static void +cifs_echo_request(struct work_struct *work) +{ + int rc; + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, echo.work); + + /* no need to ping if we got a response recently */ + if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + goto requeue_echo; + + rc = CIFSSMBEcho(server); + if (rc) + cFYI(1, "Unable to send echo request to server: %s", + server->hostname); + +requeue_echo: + queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); } static int @@ -346,8 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) struct msghdr smb_msg; struct kvec iov; struct socket *csocket = server->ssocket; - struct list_head *tmp; - struct cifsSesInfo *ses; + struct list_head *tmp, *tmp2; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; char temp; @@ -400,7 +416,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; pdu_length = 4; /* enough to get RFC1001 header */ + incomplete_rcv: + if (echo_retries > 0 && + time_after(jiffies, server->lstrp + + (echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " + "Reconnecting...", server->hostname, + (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } + length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, pdu_length, 0 /* BB other flags? 
*/); @@ -477,7 +506,7 @@ incomplete_rcv: * initialize frame) */ cifs_set_port((struct sockaddr *) - &server->addr.sockAddr, CIFS_PORT); + &server->dstaddr, CIFS_PORT); cifs_reconnect(server); csocket = server->ssocket; wake_up(&server->response_q); @@ -560,10 +589,11 @@ incomplete_rcv: continue; } + mid_entry = NULL; + server->lstrp = jiffies; - task_to_wake = NULL; spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); if ((mid_entry->mid == smb_buffer->Mid) && @@ -604,20 +634,19 @@ incomplete_rcv: mid_entry->resp_buf = smb_buffer; mid_entry->largeBuf = isLargeBuf; multi_t2_fnd: - task_to_wake = mid_entry->tsk; mid_entry->midState = MID_RESPONSE_RECEIVED; + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); #ifdef CONFIG_CIFS_STATS2 mid_entry->when_received = jiffies; #endif - /* so we do not time out requests to server - which is still responding (since server could - be busy but not dead) */ - server->lstrp = jiffies; break; } + mid_entry = NULL; } spin_unlock(&GlobalMid_Lock); - if (task_to_wake) { + + if (mid_entry != NULL) { /* Was previous buf put in mpx struct for multi-rsp? */ if (!isMultiRsp) { /* smb buffer will be freed by user thread */ @@ -626,11 +655,10 @@ multi_t2_fnd: else smallbuf = NULL; } - wake_up_process(task_to_wake); } else if (!is_valid_oplock_break(smb_buffer, server) && !isMultiRsp) { cERROR(1, "No task to wake, unknown frame received! " - "NumMids %d", midCount.counter); + "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", (char *)smb_buffer, sizeof(struct smb_hdr)); #ifdef CONFIG_CIFS_DEBUG2 @@ -678,44 +706,16 @@ multi_t2_fnd: if (smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(smallbuf); - /* - * BB: we shouldn't have to do any of this. 
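/*
 * Two things worth noting in this stretch of the demux rework. First, the
 * watchdog at incomplete_rcv above: lstrp is refreshed on every receive
 * (and on reconnect), the delayed work in cifs_echo_request() sends an
 * SMB_COM_ECHO after each idle SMB_ECHO_INTERVAL, and the thread forces a
 * reconnect after echo_retries silent intervals -- with the defaults
 * (5 x 60s) that is about 300 seconds. Second, response dispatch no
 * longer wakes a task directly: the matching loop just fires
 * mid_entry->callback() with the mid already unlinked. For synchronous
 * callers the callback is presumably nothing more than a wake-up, along
 * the lines of:
 *
 *	static void wake_up_task(struct mid_q_entry *mid)
 *	{
 *		wake_up_process(mid->callback_data);
 *	}
 *
 * with callback_data pointing at the sleeping task, while async users
 * such as the echo job supply their own teardown -- which is also why
 * cifs_reconnect() can now fail every in-flight mid with a single
 * callback loop instead of hunting for tasks to wake.
 */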
It shouldn't be - * possible to exit from the thread with active SMB sessions - */ - spin_lock(&cifs_tcp_ses_lock); - if (list_empty(&server->pending_mid_q)) { - /* loop through server session structures attached to this and - mark them dead */ - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, - smb_ses_list); - ses->status = CifsExiting; - ses->server = NULL; - } - spin_unlock(&cifs_tcp_ses_lock); - } else { - /* although we can not zero the server struct pointer yet, - since there are active requests which may depnd on them, - mark the corresponding SMB sessions as exiting too */ - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, - smb_ses_list); - ses->status = CifsExiting; - } - + if (!list_empty(&server->pending_mid_q)) { spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - if (mid_entry->midState == MID_REQUEST_SUBMITTED) { - cFYI(1, "Clearing Mid 0x%x - waking up ", + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Clearing Mid 0x%x - issuing callback", mid_entry->mid); - task_to_wake = mid_entry->tsk; - if (task_to_wake) - wake_up_process(task_to_wake); - } + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); } spin_unlock(&GlobalMid_Lock); - spin_unlock(&cifs_tcp_ses_lock); /* 1/8th of sec is more than enough time for them to exit */ msleep(125); } @@ -733,18 +733,6 @@ multi_t2_fnd: coming home not much else we can do but free the memory */ } - /* last chance to mark ses pointers invalid - if there are any pointing to this (e.g - if a crazy root user tried to kill cifsd - kernel thread explicitly this might happen) */ - /* BB: This shouldn't be necessary, see above */ - spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); - ses->server = NULL; - } - spin_unlock(&cifs_tcp_ses_lock); - kfree(server->hostname); task_to_wake = xchg(&server->tsk, NULL); kfree(server); @@ -817,11 +805,11 @@ cifs_parse_mount_options(char *options, const char *devname, * informational, only used for servers that do not support * port 445 and it can be overridden at mount time */ - memset(vol->source_rfc1001_name, 0x20, 15); - for (i = 0; i < strnlen(nodename, 15); i++) + memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); + for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) vol->source_rfc1001_name[i] = toupper(nodename[i]); - vol->source_rfc1001_name[15] = 0; + vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0; /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ vol->target_rfc1001_name[0] = 0; @@ -985,13 +973,11 @@ cifs_parse_mount_options(char *options, const char *devname, return 1; } else if (strnicmp(value, "krb5", 4) == 0) { vol->secFlg |= CIFSSEC_MAY_KRB5; -#ifdef CONFIG_CIFS_EXPERIMENTAL } else if (strnicmp(value, "ntlmsspi", 8) == 0) { vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN; } else if (strnicmp(value, "ntlmssp", 7) == 0) { vol->secFlg |= CIFSSEC_MAY_NTLMSSP; -#endif } else if (strnicmp(value, "ntlmv2i", 7) == 0) { vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN; @@ -1116,6 +1102,8 @@ cifs_parse_mount_options(char *options, const char *devname, } else if (!strnicmp(data, "uid", 3) && value && *value) { vol->linux_uid = simple_strtoul(value, &value, 0); uid_specified = 
true; + } else if (!strnicmp(data, "cruid", 5) && value && *value) { + vol->cred_uid = simple_strtoul(value, &value, 0); } else if (!strnicmp(data, "forceuid", 8)) { override_uid = 1; } else if (!strnicmp(data, "noforceuid", 10)) { @@ -1168,22 +1156,22 @@ cifs_parse_mount_options(char *options, const char *devname, if (!value || !*value || (*value == ' ')) { cFYI(1, "invalid (empty) netbiosname"); } else { - memset(vol->source_rfc1001_name, 0x20, 15); - for (i = 0; i < 15; i++) { - /* BB are there cases in which a comma can be - valid in this workstation netbios name (and need - special handling)? */ - - /* We do not uppercase netbiosname for user */ + memset(vol->source_rfc1001_name, 0x20, + RFC1001_NAME_LEN); + /* + * FIXME: are there cases in which a comma can + * be valid in workstation netbios name (and + * need special handling)? + */ + for (i = 0; i < RFC1001_NAME_LEN; i++) { + /* don't ucase netbiosname for user */ if (value[i] == 0) break; - else - vol->source_rfc1001_name[i] = - value[i]; + vol->source_rfc1001_name[i] = value[i]; } /* The string has 16th byte zero still from set at top of the function */ - if ((i == 15) && (value[i] != 0)) + if (i == RFC1001_NAME_LEN && value[i] != 0) printk(KERN_WARNING "CIFS: netbiosname" " longer than 15 truncated.\n"); } @@ -1193,7 +1181,8 @@ cifs_parse_mount_options(char *options, const char *devname, cFYI(1, "empty server netbiosname specified"); } else { /* last byte, type, is 0x20 for servr type */ - memset(vol->target_rfc1001_name, 0x20, 16); + memset(vol->target_rfc1001_name, 0x20, + RFC1001_NAME_LEN_WITH_NULL); for (i = 0; i < 15; i++) { /* BB are there cases in which a comma can be @@ -1210,7 +1199,7 @@ cifs_parse_mount_options(char *options, const char *devname, } /* The string has 16th byte zero still from set at top of the function */ - if ((i == 15) && (value[i] != 0)) + if (i == RFC1001_NAME_LEN && value[i] != 0) printk(KERN_WARNING "CIFS: server net" "biosname longer than 15 truncated.\n"); } @@ -1341,10 +1330,8 @@ cifs_parse_mount_options(char *options, const char *devname, vol->no_psx_acl = 0; } else if (strnicmp(data, "noacl", 5) == 0) { vol->no_psx_acl = 1; -#ifdef CONFIG_CIFS_EXPERIMENTAL } else if (strnicmp(data, "locallease", 6) == 0) { vol->local_lease = 1; -#endif } else if (strnicmp(data, "sign", 4) == 0) { vol->secFlg |= CIFSSEC_MUST_SIGN; } else if (strnicmp(data, "seal", 4) == 0) { @@ -1454,35 +1441,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) } } +/* + * If no port is specified in addr structure, we try to match with 445 port + * and if it fails - with 139 ports. It should be called only if address + * families of server and addr are equal. 
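An aside on the comment just above: when the mount request leaves the port unspecified, the new match_port() helper accepts an existing connection on either well-known SMB port, preferring 445. A minimal user-space sketch of that rule, assuming IPv4 only; port_matches() is a hypothetical stand-in, not part of the patch:

#include <stdbool.h>
#include <arpa/inet.h>

#define CIFS_PORT    445   /* direct-hosted SMB */
#define RFC1001_PORT 139   /* NetBIOS session service */

/* both arguments in network byte order; 0 means "unspecified" */
static bool port_matches(in_port_t requested, in_port_t existing)
{
        if (requested)          /* explicit port: exact match only */
                return requested == existing;
        if (existing == htons(CIFS_PORT))       /* default: 445 first... */
                return true;
        return existing == htons(RFC1001_PORT); /* ...then accept 139 */
}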
+ */ +static bool +match_port(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + unsigned short int port, *sport; + + switch (addr->sa_family) { + case AF_INET: + sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port; + port = ((struct sockaddr_in *) addr)->sin_port; + break; + case AF_INET6: + sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port; + port = ((struct sockaddr_in6 *) addr)->sin6_port; + break; + default: + WARN_ON(1); + return false; + } + + if (!port) { + port = htons(CIFS_PORT); + if (port == *sport) + return true; + + port = htons(RFC1001_PORT); + } + + return port == *sport; +} static bool match_address(struct TCP_Server_Info *server, struct sockaddr *addr, struct sockaddr *srcaddr) { - struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; - switch (addr->sa_family) { - case AF_INET: - if (addr4->sin_addr.s_addr != - server->addr.sockAddr.sin_addr.s_addr) - return false; - if (addr4->sin_port && - addr4->sin_port != server->addr.sockAddr.sin_port) + case AF_INET: { + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in *srv_addr4 = + (struct sockaddr_in *)&server->dstaddr; + + if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr) return false; break; - case AF_INET6: + } + case AF_INET6: { + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + struct sockaddr_in6 *srv_addr6 = + (struct sockaddr_in6 *)&server->dstaddr; + if (!ipv6_addr_equal(&addr6->sin6_addr, - &server->addr.sockAddr6.sin6_addr)) - return false; - if (addr6->sin6_scope_id != - server->addr.sockAddr6.sin6_scope_id) + &srv_addr6->sin6_addr)) return false; - if (addr6->sin6_port && - addr6->sin6_port != server->addr.sockAddr6.sin6_port) + if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id) return false; break; } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) return false; @@ -1549,6 +1572,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) (struct sockaddr *)&vol->srcaddr)) continue; + if (!match_port(server, addr)) + continue; + if (!match_security(server, vol)) continue; @@ -1575,6 +1601,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) list_del_init(&server->tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); + cancel_delayed_work_sync(&server->echo); + spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); @@ -1664,8 +1692,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; + tcp_ses->lstrp = jiffies; INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); + INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); /* * at this point we are the only ones with the pointer @@ -1681,14 +1711,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "attempting ipv6 connect"); /* BB should we allow ipv6 on port 139? 
*/ /* other OS never observed in Wild doing 139 with v6 */ - memcpy(&tcp_ses->addr.sockAddr6, sin_server6, - sizeof(struct sockaddr_in6)); - rc = ipv6_connect(tcp_ses); - } else { - memcpy(&tcp_ses->addr.sockAddr, sin_server, - sizeof(struct sockaddr_in)); - rc = ipv4_connect(tcp_ses); - } + memcpy(&tcp_ses->dstaddr, sin_server6, + sizeof(struct sockaddr_in6)); + } else + memcpy(&tcp_ses->dstaddr, sin_server, + sizeof(struct sockaddr_in)); + + rc = ip_connect(tcp_ses); if (rc < 0) { cERROR(1, "Error connecting to socket. Aborting operation"); goto out_err_crypto_release; @@ -1715,6 +1744,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); + /* queue echo request delayed work */ + queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + return tcp_ses; out_err_crypto_release: @@ -1793,6 +1825,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) { int rc = -ENOMEM, xid; struct cifsSesInfo *ses; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; xid = GetXid(); @@ -1836,12 +1870,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) /* new SMB session uses our server ref */ ses->server = server; - if (server->addr.sockAddr6.sin6_family == AF_INET6) - sprintf(ses->serverName, "%pI6", - &server->addr.sockAddr6.sin6_addr); + if (server->dstaddr.ss_family == AF_INET6) + sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); else - sprintf(ses->serverName, "%pI4", - &server->addr.sockAddr.sin_addr.s_addr); + sprintf(ses->serverName, "%pI4", &addr->sin_addr); if (volume_info->username) strncpy(ses->userName, volume_info->username, @@ -2136,19 +2168,106 @@ bind_socket(struct TCP_Server_Info *server) } static int -ipv4_connect(struct TCP_Server_Info *server) +ip_rfc1001_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + /* + * some servers require RFC1001 sessinit before sending + * negprot - BB check reconnection in case where second + * sessinit is sent but no second negprot + */ + struct rfc1002_session_packet *ses_init_buf; + struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), + GFP_KERNEL); + if (ses_init_buf) { + ses_init_buf->trailer.session_req.called_len = 32; + + if (server->server_RFC1001_name && + server->server_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.calling_len = 32; + + /* + * calling name ends in null (byte 16) from old smb + * convention. + */ + if (server->workstation_RFC1001_name && + server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. 
+ session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.scope1 = 0; + ses_init_buf->trailer.session_req.scope2 = 0; + smb_buf = (struct smb_hdr *)ses_init_buf; + + /* sizeof RFC1002_SESSION_REQUEST with no scope */ + smb_buf->smb_buf_length = 0x81000044; + rc = smb_send(server, smb_buf, 0x44); + kfree(ses_init_buf); + /* + * RFC1001 layer in at least one server + * requires very short break before negprot + * presumably because not expecting negprot + * to follow so fast. This is a simple + * solution that works without + * complicating the code and causes no + * significant slowing down on mount + * for everyone else + */ + usleep_range(1000, 2000); + } + /* + * else the negprot may still work without this + * even though malloc failed + */ + + return rc; +} + +static int +generic_ip_connect(struct TCP_Server_Info *server) { int rc = 0; - int val; - bool connected = false; - __be16 orig_port = 0; + unsigned short int sport; + int slen, sfamily; struct socket *socket = server->ssocket; + struct sockaddr *saddr; + + saddr = (struct sockaddr *) &server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) { + sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + slen = sizeof(struct sockaddr_in6); + sfamily = AF_INET6; + } else { + sport = ((struct sockaddr_in *) saddr)->sin_port; + slen = sizeof(struct sockaddr_in); + sfamily = AF_INET; + } if (socket == NULL) { - rc = sock_create_kern(PF_INET, SOCK_STREAM, + rc = sock_create_kern(sfamily, SOCK_STREAM, IPPROTO_TCP, &socket); if (rc < 0) { cERROR(1, "Error %d creating socket", rc); + server->ssocket = NULL; return rc; } @@ -2156,63 +2275,28 @@ ipv4_connect(struct TCP_Server_Info *server) cFYI(1, "Socket created"); server->ssocket = socket; socket->sk->sk_allocation = GFP_NOFS; - cifs_reclassify_socket4(socket); + if (sfamily == AF_INET6) + cifs_reclassify_socket6(socket); + else + cifs_reclassify_socket4(socket); } rc = bind_socket(server); if (rc < 0) return rc; - /* user overrode default port */ - if (server->addr.sockAddr.sin_port) { - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr, - sizeof(struct sockaddr_in), 0); - if (rc >= 0) - connected = true; - } - - if (!connected) { - /* save original port so we can retry user specified port - later if fall back ports fail this time */ - orig_port = server->addr.sockAddr.sin_port; - - /* do not retry on the same port we just failed on */ - if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) { - server->addr.sockAddr.sin_port = htons(CIFS_PORT); - rc = socket->ops->connect(socket, - (struct sockaddr *) - &server->addr.sockAddr, - sizeof(struct sockaddr_in), 0); - if (rc >= 0) - connected = true; - } - } - if (!connected) { - server->addr.sockAddr.sin_port = htons(RFC1001_PORT); - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr, - sizeof(struct sockaddr_in), 0); - if (rc >= 0) - connected = true; - } - - /* give up here - unless we want to retry on different - protocol families some day */ - if (!connected) { - if (orig_port) - server->addr.sockAddr.sin_port = orig_port; - cFYI(1, "Error %d connecting to server via ipv4", rc); + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cFYI(1, "Error %d connecting to server", rc); sock_release(socket); server->ssocket = NULL; return rc; } - /* * Eventually check for other socket options to change from - * the default. sock_setsockopt not used because it expects - * user space buffer + * the default. 
sock_setsockopt not used because it expects + * user space buffer */ socket->sk->sk_rcvtimeo = 7 * HZ; socket->sk->sk_sndtimeo = 5 * HZ; @@ -2226,7 +2310,7 @@ ipv4_connect(struct TCP_Server_Info *server) } if (server->tcp_nodelay) { - val = 1; + int val = 1; rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, (char *)&val, sizeof(val)); if (rc) @@ -2237,161 +2321,39 @@ ipv4_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - /* send RFC1001 sessinit */ - if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { - /* some servers require RFC1001 sessinit before sending - negprot - BB check reconnection in case where second - sessinit is sent but no second negprot */ - struct rfc1002_session_packet *ses_init_buf; - struct smb_hdr *smb_buf; - ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), - GFP_KERNEL); - if (ses_init_buf) { - ses_init_buf->trailer.session_req.called_len = 32; - if (server->server_RFC1001_name && - server->server_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - server->server_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - DEFAULT_CIFS_CALLED_NAME, - RFC1001_NAME_LEN_WITH_NULL); - - ses_init_buf->trailer.session_req.calling_len = 32; - - /* calling name ends in null (byte 16) from old smb - convention. */ - if (server->workstation_RFC1001_name && - server->workstation_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - server->workstation_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - "LINUX_CIFS_CLNT", - RFC1001_NAME_LEN_WITH_NULL); - - ses_init_buf->trailer.session_req.scope1 = 0; - ses_init_buf->trailer.session_req.scope2 = 0; - smb_buf = (struct smb_hdr *)ses_init_buf; - /* sizeof RFC1002_SESSION_REQUEST with no scope */ - smb_buf->smb_buf_length = 0x81000044; - rc = smb_send(server, smb_buf, 0x44); - kfree(ses_init_buf); - msleep(1); /* RFC1001 layer in at least one server - requires very short break before negprot - presumably because not expecting negprot - to follow so fast. This is a simple - solution that works without - complicating the code and causes no - significant slowing down on mount - for everyone else */ - } - /* else the negprot may still work without this - even though malloc failed */ - - } + if (sport == htons(RFC1001_PORT)) + rc = ip_rfc1001_connect(server); return rc; } static int -ipv6_connect(struct TCP_Server_Info *server) +ip_connect(struct TCP_Server_Info *server) { - int rc = 0; - int val; - bool connected = false; - __be16 orig_port = 0; - struct socket *socket = server->ssocket; + unsigned short int *sport; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; - if (socket == NULL) { - rc = sock_create_kern(PF_INET6, SOCK_STREAM, - IPPROTO_TCP, &socket); - if (rc < 0) { - cERROR(1, "Error %d creating ipv6 socket", rc); - socket = NULL; - return rc; - } + if (server->dstaddr.ss_family == AF_INET6) + sport = &addr6->sin6_port; + else + sport = &addr->sin_port; - /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ - cFYI(1, "ipv6 Socket created"); - server->ssocket = socket; - socket->sk->sk_allocation = GFP_NOFS; - cifs_reclassify_socket6(socket); - } + if (*sport == 0) { + int rc; - rc = bind_socket(server); - if (rc < 0) - return rc; + /* try with 445 port at first */ + *sport = htons(CIFS_PORT); - /* user overrode default port */ - if (server->addr.sockAddr6.sin6_port) { - rc = socket->ops->connect(socket, - (struct sockaddr *) &server->addr.sockAddr6, - sizeof(struct sockaddr_in6), 0); + rc = generic_ip_connect(server); if (rc >= 0) - connected = true; - } - - if (!connected) { - /* save original port so we can retry user specified port - later if fall back ports fail this time */ - - orig_port = server->addr.sockAddr6.sin6_port; - /* do not retry on the same port we just failed on */ - if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) { - server->addr.sockAddr6.sin6_port = htons(CIFS_PORT); - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr6, - sizeof(struct sockaddr_in6), 0); - if (rc >= 0) - connected = true; - } - } - if (!connected) { - server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT); - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr6, - sizeof(struct sockaddr_in6), 0); - if (rc >= 0) - connected = true; - } - - /* give up here - unless we want to retry on different - protocol families some day */ - if (!connected) { - if (orig_port) - server->addr.sockAddr6.sin6_port = orig_port; - cFYI(1, "Error %d connecting to server via ipv6", rc); - sock_release(socket); - server->ssocket = NULL; - return rc; - } - - /* - * Eventually check for other socket options to change from - * the default. sock_setsockopt not used because it expects - * user space buffer - */ - socket->sk->sk_rcvtimeo = 7 * HZ; - socket->sk->sk_sndtimeo = 5 * HZ; + return rc; - if (server->tcp_nodelay) { - val = 1; - rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rc) - cFYI(1, "set TCP_NODELAY socket option error %d", rc); + /* if it failed, try with 139 port */ + *sport = htons(RFC1001_PORT); } - server->ssocket = socket; - - return rc; + return generic_ip_connect(server); } void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, @@ -2970,8 +2932,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, TCONX_RSP *pSMBr; unsigned char *bcc_ptr; int rc = 0; - int length, bytes_left; - __u16 count; + int length; + __u16 bytes_left, count; if (ses == NULL) return -EIO; @@ -2999,7 +2961,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; /* skip password */ /* already aligned so no need to do it below */ } else { - pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); + pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* BB FIXME add code to fail this if NTLMv2 or Kerberos specified as required (when that support is added to the vfs in the future) as only NTLM or the much @@ -3017,7 +2979,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, #endif /* CIFS_WEAK_PW_HASH */ SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); - bcc_ptr += CIFS_SESS_KEY_SIZE; + bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { /* must align unicode strings */ *bcc_ptr = 0; /* null byte password */ @@ -3055,7 +3017,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, - CIFS_STD_OP); + 0); /* above now done in SendReceive */ if ((rc == 0) && (tcon != NULL)) { 
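The collapsed ip_connect()/generic_ip_connect() pair above reduces the old duplicated ipv4/ipv6 retry code to one rule: if no port was given, dial 445 and, only if that fails, retry the same address on 139. A sketch under that reading, IPv4 only, with connect_once() as a hypothetical stand-in for generic_ip_connect():

#include <arpa/inet.h>

#define CIFS_PORT    445
#define RFC1001_PORT 139

static int connect_with_fallback(struct sockaddr_in *dst,
                                 int (*connect_once)(struct sockaddr_in *))
{
        int rc;

        if (dst->sin_port != 0)         /* caller chose a port: no fallback */
                return connect_once(dst);

        dst->sin_port = htons(CIFS_PORT);       /* try 445 first */
        rc = connect_once(dst);
        if (rc >= 0)
                return rc;

        dst->sin_port = htons(RFC1001_PORT);    /* then fall back to 139 */
        return connect_once(dst);
}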
@@ -3065,7 +3027,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, tcon->need_reconnect = false; tcon->tid = smb_buffer_response->Tid; bcc_ptr = pByteArea(smb_buffer_response); - bytes_left = BCC(smb_buffer_response); + bytes_left = get_bcc(smb_buffer_response); length = strnlen(bcc_ptr, bytes_left - 2); if (smb_buffer->Flags2 & SMBFLG2_UNICODE) is_unicode = true; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index db2a58c00f7..dd5f22918c3 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -130,17 +130,6 @@ cifs_bp_rename_retry: return full_path; } -static void setup_cifs_dentry(struct cifsTconInfo *tcon, - struct dentry *direntry, - struct inode *newinode) -{ - if (tcon->nocase) - d_set_d_op(direntry, &cifs_ci_dentry_ops); - else - d_set_d_op(direntry, &cifs_dentry_ops); - d_instantiate(direntry, newinode); -} - /* Inode operations in similar order to how they appear in Linux file fs.h */ int @@ -293,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, args.uid = NO_CHANGE_64; args.gid = NO_CHANGE_64; } - CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle, + current->tgid); } else { /* BB implement mode setting via Windows security descriptors e.g. */ @@ -329,7 +316,7 @@ cifs_create_get_file_info: cifs_create_set_dentry: if (rc == 0) - setup_cifs_dentry(tcon, direntry, newinode); + d_instantiate(direntry, newinode); else cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); @@ -420,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); - if (pTcon->nocase) - d_set_d_op(direntry, &cifs_ci_dentry_ops); - else - d_set_d_op(direntry, &cifs_dentry_ops); if (rc == 0) d_instantiate(direntry, newinode); @@ -603,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, parent_dir_inode->i_sb, xid, NULL); if ((rc == 0) && (newInode != NULL)) { - if (pTcon->nocase) - d_set_d_op(direntry, &cifs_ci_dentry_ops); - else - d_set_d_op(direntry, &cifs_dentry_ops); d_add(direntry, newInode); if (posix_open) { filp = lookup_instantiate_filp(nd, direntry, @@ -633,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } else if (rc == -ENOENT) { rc = 0; direntry->d_time = jiffies; - if (pTcon->nocase) - d_set_d_op(direntry, &cifs_ci_dentry_ops); - else - d_set_d_op(direntry, &cifs_dentry_ops); d_add(direntry, NULL); /* if it was once a directory (but how can we tell?) 
we could do shrink_dcache_parent(direntry); */ @@ -700,6 +675,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) const struct dentry_operations cifs_dentry_ops = { .d_revalidate = cifs_d_revalidate, + .d_automount = cifs_dfs_d_automount, /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; @@ -736,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = { .d_revalidate = cifs_d_revalidate, .d_hash = cifs_ci_hash, .d_compare = cifs_ci_compare, + .d_automount = cifs_dfs_d_automount, }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5a28660ca2b..d7d65a70678 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -104,53 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OPEN; } -static inline int cifs_open_inode_helper(struct inode *inode, - struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf, - char *full_path, int xid) -{ - struct cifsInodeInfo *pCifsInode = CIFS_I(inode); - struct timespec temp; - int rc; - - if (pCifsInode->clientCanCacheRead) { - /* we have the inode open somewhere else - no need to discard cache data */ - goto client_can_cache; - } - - /* BB need same check in cifs_create too? */ - /* if not oplocked, invalidate inode pages if mtime or file - size changed */ - temp = cifs_NTtimeToUnix(buf->LastWriteTime); - if (timespec_equal(&inode->i_mtime, &temp) && - (inode->i_size == - (loff_t)le64_to_cpu(buf->EndOfFile))) { - cFYI(1, "inode unchanged on server"); - } else { - if (inode->i_mapping) { - /* BB no need to lock inode until after invalidate - since namei code should already have it locked? */ - rc = filemap_write_and_wait(inode->i_mapping); - mapping_set_error(inode->i_mapping, rc); - } - cFYI(1, "invalidating remote inode since open detected it " - "changed"); - invalidate_remote_inode(inode); - } - -client_can_cache: - if (pTcon->unix_ext) - rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, - xid); - else - rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, - xid, NULL); - - cifs_set_oplock_level(pCifsInode, oplock); - - return rc; -} - int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *poplock, __u16 *pnetfid, int xid) @@ -213,6 +166,76 @@ posix_open_ret: return rc; } +static int +cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock, + __u16 *pnetfid, int xid) +{ + int rc; + int desiredAccess; + int disposition; + FILE_ALL_INFO *buf; + + desiredAccess = cifs_convert_flags(f_flags); + +/********************************************************************* + * open flag mapping table: + * + * POSIX Flag CIFS Disposition + * ---------- ---------------- + * O_CREAT FILE_OPEN_IF + * O_CREAT | O_EXCL FILE_CREATE + * O_CREAT | O_TRUNC FILE_OVERWRITE_IF + * O_TRUNC FILE_OVERWRITE + * none of the above FILE_OPEN + * + * Note that there is not a direct match between disposition + * FILE_SUPERSEDE (ie create whether or not file exists although + * O_CREAT | O_TRUNC is similar but truncates the existing + * file rather than creating a new file as FILE_SUPERSEDE does + * (which uses the attributes / metadata passed in on open call) + *? + *? O_SYNC is a reasonable match to CIFS writethrough flag + *? and the read write flags match reasonably. O_LARGEFILE + *? is irrelevant because largefile support is always used + *? by this client. 
Flags O_APPEND, O_DIRECT, O_DIRECTORY, + * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation + *********************************************************************/ + + disposition = cifs_get_disposition(f_flags); + + /* BB pass O_SYNC flag through on file attributes .. BB */ + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = SMBLegacyOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc) + goto out; + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, + xid); + else + rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, + xid, pnetfid); + +out: + kfree(buf); + return rc; +} + struct cifsFileInfo * cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -264,6 +287,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); @@ -279,6 +303,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); + + /* in strict cache mode we need invalidate mapping on the last + close because it may cause a error when we open this file + again and get at least level II oplock */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + CIFS_I(inode)->invalid_mapping = true; + cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -317,10 +348,8 @@ int cifs_open(struct inode *inode, struct file *file) struct cifsFileInfo *pCifsFile = NULL; struct cifsInodeInfo *pCifsInode; char *full_path = NULL; - int desiredAccess; - int disposition; + bool posix_open_ok = false; __u16 netfid; - FILE_ALL_INFO *buf = NULL; xid = GetXid(); @@ -358,17 +387,7 @@ int cifs_open(struct inode *inode, struct file *file) file->f_flags, &oplock, &netfid, xid); if (rc == 0) { cFYI(1, "posix open succeeded"); - - pCifsFile = cifs_new_fileinfo(netfid, file, tlink, - oplock); - if (pCifsFile == NULL) { - CIFSSMBClose(xid, tcon, netfid); - rc = -ENOMEM; - } - - cifs_fscache_set_inode_cookie(inode, file); - - goto out; + posix_open_ok = true; } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) cERROR(1, "server %s of type %s returned" @@ -385,103 +404,39 @@ int cifs_open(struct inode *inode, struct file *file) or DFS errors */ } - desiredAccess = cifs_convert_flags(file->f_flags); - -/********************************************************************* - * open flag mapping table: - * - * POSIX Flag CIFS Disposition - * ---------- ---------------- - * O_CREAT FILE_OPEN_IF - * O_CREAT | O_EXCL FILE_CREATE - * O_CREAT | O_TRUNC FILE_OVERWRITE_IF - * O_TRUNC FILE_OVERWRITE - * none of the above FILE_OPEN - * - * Note that there is not a direct match between disposition - * FILE_SUPERSEDE (ie create whether or not file exists although - * O_CREAT | O_TRUNC is similar but truncates the existing - * file rather 
than creating a new file as FILE_SUPERSEDE does - * (which uses the attributes / metadata passed in on open call) - *? - *? O_SYNC is a reasonable match to CIFS writethrough flag - *? and the read write flags match reasonably. O_LARGEFILE - *? is irrelevant because largefile support is always used - *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY, - * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation - *********************************************************************/ - - disposition = cifs_get_disposition(file->f_flags); - - /* BB pass O_SYNC flag through on file attributes .. BB */ - - /* Also refresh inode by passing in file_info buf returned by SMBOpen - and calling get_inode_info with returned buf (at least helps - non-Unix server case) */ - - /* BB we can not do this if this is the second open of a file - and the first handle has writebehind data, we might be - able to simply do a filemap_fdatawrite/filemap_fdatawait first */ - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } - - if (tcon->ses->capabilities & CAP_NT_SMBS) - rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); - else - rc = -EIO; /* no NT SMB support fall into legacy open below */ - - if (rc == -EIO) { - /* Old server, try legacy style OpenX */ - rc = SMBLegacyOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); - } - if (rc) { - cFYI(1, "cifs_open returned 0x%x", rc); - goto out; + if (!posix_open_ok) { + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, + file->f_flags, &oplock, &netfid, xid); + if (rc) + goto out; } - rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid); - if (rc != 0) - goto out; - pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); if (pCifsFile == NULL) { + CIFSSMBClose(xid, tcon, netfid); rc = -ENOMEM; goto out; } cifs_fscache_set_inode_cookie(inode, file); - if (oplock & CIFS_CREATE_ACTION) { + if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { /* time to set mode which we can not set earlier due to problems creating new read-only files */ - if (tcon->unix_ext) { - struct cifs_unix_set_info_args args = { - .mode = inode->i_mode, - .uid = NO_CHANGE_64, - .gid = NO_CHANGE_64, - .ctime = NO_CHANGE_64, - .atime = NO_CHANGE_64, - .mtime = NO_CHANGE_64, - .device = 0, - }; - CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - } + struct cifs_unix_set_info_args args = { + .mode = inode->i_mode, + .uid = NO_CHANGE_64, + .gid = NO_CHANGE_64, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid, + pCifsFile->pid); } out: - kfree(buf); kfree(full_path); FreeXid(xid); cifs_put_tlink(tlink); @@ -779,12 +734,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) /* BB we could chain these into one lock request BB */ rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */ ); + 0, 1, lockType, 0 /* wait flag */, 0); if (rc == 0) { rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1 /* numUnlock */ , 0 /* numLock */ , lockType, - 0 /* wait flag */ ); + 0 /* wait flag */, 0); pfLock->fl_type = 
F_UNLCK; if (rc != 0) cERROR(1, "Error unlocking previously locked " @@ -801,13 +756,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 0, 1, lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */); + 0 /* wait flag */, 0); if (rc == 0) { rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1, 0, lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */); + 0 /* wait flag */, 0); pfLock->fl_type = F_RDLCK; if (rc != 0) cERROR(1, "Error unlocking " @@ -850,8 +805,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) if (numLock) { rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, - 0, numLock, lockType, wait_flag); + pfLock->fl_start, 0, numLock, lockType, + wait_flag, 0); if (rc == 0) { /* For Windows locks we must store them. */ @@ -871,9 +826,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) (pfLock->fl_start + length) >= (li->offset + li->length)) { stored_rc = CIFSSMBLock(xid, tcon, - netfid, - li->length, li->offset, - 1, 0, li->type, false); + netfid, li->length, + li->offset, 1, 0, + li->type, false, 0); if (stored_rc) rc = stored_rc; else { @@ -892,29 +847,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) return rc; } -/* - * Set the timeout on write requests past EOF. For some servers (Windows) - * these calls can be very long. - * - * If we're writing >10M past the EOF we give a 180s timeout. Anything less - * than that gets a 45s timeout. Writes not past EOF get 15s timeouts. - * The 10M cutoff is totally arbitrary. A better scheme for this would be - * welcome if someone wants to suggest one. - * - * We may be able to do a better job with this if there were some way to - * declare that a file should be sparse. 
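For reference, the tiering the deleted helper implemented, per its own comment: writes at or before the server EOF got the standard 15s class, writes more than 10M past EOF got 180s, and anything in between got 45s. Restated as a sketch; the numeric values are illustrative stand-ins for the CIFS_*_OP classes, not the kernel's definitions:

enum write_timeout { STD_OP = 15, LONG_OP = 45, VLONG_OP = 180 };

static enum write_timeout write_timeout_for(long long server_eof,
                                            long long offset)
{
        if (offset <= server_eof)
                return STD_OP;                          /* not past EOF */
        if (offset > server_eof + 10LL * 1024 * 1024)
                return VLONG_OP;                        /* >10M past EOF */
        return LONG_OP;                                 /* modestly past EOF */
}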
- */ -static int -cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset) -{ - if (offset <= cifsi->server_eof) - return CIFS_STD_OP; - else if (offset > (cifsi->server_eof + (10 * 1024 * 1024))) - return CIFS_VLONG_OP; - else - return CIFS_LONG_OP; -} - /* update the file size (if needed) after a write */ static void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, @@ -935,7 +867,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, unsigned int total_written; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - int xid, long_op; + int xid; struct cifsFileInfo *open_file; struct cifsInodeInfo *cifsi = CIFS_I(inode); @@ -956,7 +888,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, xid = GetXid(); - long_op = cifs_write_timeout(cifsi, *poffset); for (total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -984,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, min_t(const int, cifs_sb->wsize, write_size - total_written), *poffset, &bytes_written, - NULL, write_data + total_written, long_op); + NULL, write_data + total_written, 0); } if (rc || (bytes_written == 0)) { if (total_written) @@ -997,8 +928,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; } - long_op = CIFS_STD_OP; /* subsequent writes fast - - 15 seconds is plenty */ } cifs_stats_bytes_written(pTcon, total_written); @@ -1027,7 +956,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, unsigned int total_written; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - int xid, long_op; + int xid; struct dentry *dentry = open_file->dentry; struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); @@ -1040,7 +969,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, xid = GetXid(); - long_op = cifs_write_timeout(cifsi, *poffset); for (total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -1070,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len, *poffset, &bytes_written, - iov, 1, long_op); + iov, 1, 0); } else rc = CIFSSMBWrite(xid, pTcon, open_file->netfid, @@ -1078,7 +1006,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, write_size - total_written), *poffset, &bytes_written, write_data + total_written, - NULL, long_op); + NULL, 0); } if (rc || (bytes_written == 0)) { if (total_written) @@ -1091,8 +1019,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; } - long_op = CIFS_STD_OP; /* subsequent writes fast - - 15 seconds is plenty */ } cifs_stats_bytes_written(pTcon, total_written); @@ -1292,7 +1218,7 @@ static int cifs_writepages(struct address_space *mapping, struct pagevec pvec; int rc = 0; int scanned = 0; - int xid, long_op; + int xid; cifs_sb = CIFS_SB(mapping->host->i_sb); @@ -1430,43 +1356,67 @@ retry: break; } if (n_iov) { +retry_write: open_file = find_writable_file(CIFS_I(mapping->host), false); if (!open_file) { cERROR(1, "No writable handles for inode"); rc = -EBADF; } else { - long_op = cifs_write_timeout(cifsi, offset); rc = CIFSSMBWrite2(xid, tcon, open_file->netfid, bytes_to_write, offset, &bytes_written, iov, n_iov, - long_op); + 0); cifsFileInfo_put(open_file); - cifs_update_eof(cifsi, offset, bytes_written); } - if (rc || 
bytes_written < bytes_to_write) { - cERROR(1, "Write2 ret %d, wrote %d", - rc, bytes_written); - mapping_set_error(mapping, rc); - } else { + cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written); + + /* + * For now, treat a short write as if nothing got + * written. A zero length write however indicates + * ENOSPC or EFBIG. We have no way to know which + * though, so call it ENOSPC for now. EFBIG would + * get translated to AS_EIO anyway. + * + * FIXME: make it take into account the data that did + * get written + */ + if (rc == 0) { + if (bytes_written == 0) + rc = -ENOSPC; + else if (bytes_written < bytes_to_write) + rc = -EAGAIN; + } + + /* retry on data-integrity flush */ + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) + goto retry_write; + + /* fix the stats and EOF */ + if (bytes_written > 0) { cifs_stats_bytes_written(tcon, bytes_written); + cifs_update_eof(cifsi, offset, bytes_written); } for (i = 0; i < n_iov; i++) { page = pvec.pages[first + i]; - /* Should we also set page error on - success rc but too little data written? */ - /* BB investigate retry logic on temporary - server crash cases and how recovery works - when page marked as error */ - if (rc) + /* on retryable write error, redirty page */ + if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, page); + else if (rc != 0) SetPageError(page); kunmap(page); unlock_page(page); end_page_writeback(page); page_cache_release(page); } + + if (rc != -EAGAIN) + mapping_set_error(mapping, rc); + else + rc = 0; + if ((wbc->nr_to_write -= n_iov) <= 0) done = 1; index = next; @@ -1578,27 +1528,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, return rc; } -int cifs_fsync(struct file *file, int datasync) +int cifs_strict_fsync(struct file *file, int datasync) { int xid; int rc = 0; struct cifsTconInfo *tcon; struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); xid = GetXid(); cFYI(1, "Sync file - name: %s datasync: 0x%x", file->f_path.dentry->d_name.name, datasync); - rc = filemap_write_and_wait(inode->i_mapping); - if (rc == 0) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + if (!CIFS_I(inode)->clientCanCacheRead) + cifs_invalidate_mapping(inode); - tcon = tlink_tcon(smbfile->tlink); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) - rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); - } + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + + FreeXid(xid); + return rc; +} + +int cifs_fsync(struct file *file, int datasync) +{ + int xid; + int rc = 0; + struct cifsTconInfo *tcon; + struct cifsFileInfo *smbfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + xid = GetXid(); + + cFYI(1, "Sync file - name: %s datasync: 0x%x", + file->f_path.dentry->d_name.name, datasync); + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); FreeXid(xid); return rc; @@ -1649,42 +1619,42 @@ int cifs_flush(struct file *file, fl_owner_t id) return rc; } -ssize_t cifs_user_read(struct file *file, char __user *read_data, - size_t read_size, loff_t *poffset) +static ssize_t +cifs_iovec_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *poffset) { - int rc = -EACCES; - unsigned int bytes_read = 0; - unsigned int total_read = 0; - unsigned int 
current_read_size; + int rc; + int xid; + unsigned int total_read, bytes_read = 0; + size_t len, cur_len; + int iov_offset = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - int xid; struct cifsFileInfo *open_file; - char *smb_read_data; - char __user *current_offset; struct smb_com_read_rsp *pSMBr; + char *read_data; + + if (!nr_segs) + return 0; + + len = iov_length(iov, nr_segs); + if (!len) + return 0; xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } open_file = file->private_data; pTcon = tlink_tcon(open_file->tlink); if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); - for (total_read = 0, current_offset = read_data; - read_size > total_read; - total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(const int, read_size - total_read, - cifs_sb->rsize); + for (total_read = 0; total_read < len; total_read += bytes_read) { + cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); rc = -EAGAIN; - smb_read_data = NULL; + read_data = NULL; + while (rc == -EAGAIN) { int buf_type = CIFS_NO_BUFFER; if (open_file->invalidHandle) { @@ -1692,27 +1662,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, if (rc != 0) break; } - rc = CIFSSMBRead(xid, pTcon, - open_file->netfid, - current_read_size, *poffset, - &bytes_read, &smb_read_data, - &buf_type); - pSMBr = (struct smb_com_read_rsp *)smb_read_data; - if (smb_read_data) { - if (copy_to_user(current_offset, - smb_read_data + - 4 /* RFC1001 length field */ + - le16_to_cpu(pSMBr->DataOffset), - bytes_read)) + rc = CIFSSMBRead(xid, pTcon, open_file->netfid, + cur_len, *poffset, &bytes_read, + &read_data, &buf_type); + pSMBr = (struct smb_com_read_rsp *)read_data; + if (read_data) { + char *data_offset = read_data + 4 + + le16_to_cpu(pSMBr->DataOffset); + if (memcpy_toiovecend(iov, data_offset, + iov_offset, bytes_read)) rc = -EFAULT; - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); + cifs_small_buf_release(read_data); else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; + cifs_buf_release(read_data); + read_data = NULL; + iov_offset += bytes_read; } } + if (rc || (bytes_read == 0)) { if (total_read) { break; @@ -1725,13 +1693,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, *poffset += bytes_read; } } + FreeXid(xid); return total_read; } +ssize_t cifs_user_read(struct file *file, char __user *read_data, + size_t read_size, loff_t *poffset) +{ + struct iovec iov; + iov.iov_base = read_data; + iov.iov_len = read_size; + + return cifs_iovec_read(file, &iov, 1, poffset); +} + +static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t read; + + read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); + if (read > 0) + iocb->ki_pos = pos; + + return read; +} + +ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + if (CIFS_I(inode)->clientCanCacheRead) + return generic_file_aio_read(iocb, iov, nr_segs, pos); + + /* + * In strict cache mode we need to read from the server all the time + * if we don't have level II oplock because the server can delay mtime + * change - so we can't make a decision about inode invalidating. 
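The decision that comment describes is small enough to restate directly from the hunk below: with a read (level II) oplock the client may trust the page cache; without one, every read goes to the server. A condensed restatement of cifs_strict_readv() with the reasoning annotated (kernel context assumed, not standalone):

ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
                          unsigned long nr_segs, loff_t pos)
{
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;

        /* level II oplock: server promises to break it before changing
         * the file, so cached pages are safe to serve */
        if (CIFS_I(inode)->clientCanCacheRead)
                return generic_file_aio_read(iocb, iov, nr_segs, pos);

        /* no oplock: mtime changes may be delayed server-side, so read
         * uncached rather than guess at invalidation */
        return cifs_user_readv(iocb, iov, nr_segs, pos);
}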
+ * And we can also fail with pagereading if there are mandatory locks + * on pages affected by this read but not on the region from pos to + * pos+len-1. + */ + + return cifs_user_readv(iocb, iov, nr_segs, pos); +} static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, - loff_t *poffset) + loff_t *poffset) { int rc = -EACCES; unsigned int bytes_read = 0; @@ -1799,6 +1811,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, return total_read; } +int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + struct inode *inode = file->f_path.dentry->d_inode; + + xid = GetXid(); + + if (!CIFS_I(inode)->clientCanCacheRead) + cifs_invalidate_mapping(inode); + + rc = generic_file_mmap(file, vma); + FreeXid(xid); + return rc; +} + int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int rc, xid; @@ -2245,7 +2272,8 @@ void cifs_oplock_break(struct work_struct *work) */ if (!cfile->oplock_break_cancelled) { rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, - 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false); + 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a853a89857a..8852470b4fb 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -32,7 +32,7 @@ #include "fscache.h" -static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) +static void cifs_set_ops(struct inode *inode) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) inode->i_fop = &cifs_file_direct_nobrl_ops; else inode->i_fop = &cifs_file_direct_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_strict_nobrl_ops; + else + inode->i_fop = &cifs_file_strict_ops; } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) inode->i_fop = &cifs_file_nobrl_ops; else { /* not direct, send byte range locks */ inode->i_fop = &cifs_file_ops; } - /* check if server can support readpages */ if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) @@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) break; case S_IFDIR: #ifdef CONFIG_CIFS_DFS_UPCALL - if (is_dfs_referral) { + if (IS_AUTOMOUNT(inode)) { inode->i_op = &cifs_dfs_referral_inode_operations; } else { #else /* NO DFS support, treat as a directory */ @@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } spin_unlock(&inode->i_lock); - cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); + if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + inode->i_flags |= S_AUTOMOUNT; + cifs_set_ops(inode); } void @@ -518,6 +524,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; @@ -779,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque) if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; + /* use createtime like an i_generation field */ + if (CIFS_I(inode)->createtime != fattr->cf_createtime) + return 0; + /* don't match inode of different type */ if 
((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) return 0; @@ -796,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque) struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; + CIFS_I(inode)->createtime = fattr->cf_createtime; return 0; } @@ -1318,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */ inc_nlink(inode); - if (pTcon->nocase) - d_set_d_op(direntry, &cifs_ci_dentry_ops); - else - d_set_d_op(direntry, &cifs_dentry_ops); cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); cifs_fill_uniqueid(inode->i_sb, &fattr); @@ -1362,10 +1370,6 @@ mkdir_get_info: rc = cifs_get_inode_info(&newinode, full_path, NULL, inode->i_sb, xid, NULL); - if (pTcon->nocase) - d_set_d_op(direntry, &cifs_ci_dentry_ops); - else - d_set_d_op(direntry, &cifs_dentry_ops); d_instantiate(direntry, newinode); /* setting nlink not necessary except in cases where we * failed to get it from the server or was set bogus */ @@ -1679,7 +1683,7 @@ cifs_inode_needs_reval(struct inode *inode) /* * Zap the cache. Called when invalid_mapping flag is set. */ -static void +void cifs_invalidate_mapping(struct inode *inode) { int rc; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index fe2f6a93c49..306769de2fb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -524,10 +524,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", rc); } else { - if (pTcon->nocase) - d_set_d_op(direntry, &cifs_ci_dentry_ops); - else - d_set_d_op(direntry, &cifs_dentry_ops); d_instantiate(direntry, newinode); } } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 43f10281bc1..a09e077ba92 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -571,7 +571,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pCifsInode = CIFS_I(netfile->dentry->d_inode); cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel); + pSMB->OplockLevel ? OPLOCK_READ : 0); /* * cifs_oplock_break_put() can't be called * from here. Get reference after queueing @@ -637,77 +637,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length) return; } -/* Convert 16 bit Unicode pathname to wire format from string in current code - page. Conversion may involve remapping up the seven characters that are - only legal in POSIX-like OS (if they are present in the string). 
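The remapping this relocated helper performs covers the reserved wire characters that are nonetheless legal in POSIX names. Assuming the UNI_* constants follow the usual c + 0xF000 private-use-range convention (an assumption; the definitions are not in this hunk), the active cases reduce to:

#include <stdint.h>

/* returns the substitute code point, or 0 for pass-through characters;
 * '\\' is deliberately omitted, as in the original (slash is still the
 * path separator at this layer) */
static uint16_t remap_reserved_char(char c)
{
        switch (c) {
        case ':': case '*': case '?':
        case '<': case '>': case '|':
                return (uint16_t)(c + 0xF000);  /* shift into 0xF0xx range */
        default:
                return 0;
        }
}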
Path - names are little endian 16 bit Unicode on the wire */ -int -cifsConvertToUCS(__le16 *target, const char *source, int maxlen, - const struct nls_table *cp, int mapChars) -{ - int i, j, charlen; - int len_remaining = maxlen; - char src_char; - __u16 temp; - - if (!mapChars) - return cifs_strtoUCS(target, source, PATH_MAX, cp); - - for (i = 0, j = 0; i < maxlen; j++) { - src_char = source[i]; - switch (src_char) { - case 0: - target[j] = 0; - goto ctoUCS_out; - case ':': - target[j] = cpu_to_le16(UNI_COLON); - break; - case '*': - target[j] = cpu_to_le16(UNI_ASTERIK); - break; - case '?': - target[j] = cpu_to_le16(UNI_QUESTION); - break; - case '<': - target[j] = cpu_to_le16(UNI_LESSTHAN); - break; - case '>': - target[j] = cpu_to_le16(UNI_GRTRTHAN); - break; - case '|': - target[j] = cpu_to_le16(UNI_PIPE); - break; - /* BB We can not handle remapping slash until - all the calls to build_path_from_dentry - are modified, as they use slash as separator BB */ - /* case '\\': - target[j] = cpu_to_le16(UNI_SLASH); - break;*/ - default: - charlen = cp->char2uni(source+i, - len_remaining, &temp); - /* if no match, use question mark, which - at least in some cases servers as wild card */ - if (charlen < 1) { - target[j] = cpu_to_le16(0x003f); - charlen = 1; - } else - target[j] = cpu_to_le16(temp); - len_remaining -= charlen; - /* character may take more than one byte in the - the source string, but will take exactly two - bytes in the target string */ - i += charlen; - continue; - } - i++; /* move to next char in source string */ - len_remaining--; - } - -ctoUCS_out: - return i; -} - void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 9aad47a2d62..8d9189f6447 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr) } /* else ERRHRD class errors or junk - return EIO */ - cFYI(1, "Mapping smb error code %d to POSIX err %d", - smberrcode, rc); + cFYI(1, "Mapping smb error code 0x%x to POSIX err %d", + le32_to_cpu(smb->Status.CifsError), rc); /* generic corrective action e.g. 
reconnect SMB session on * ERRbaduid could be added */ @@ -916,14 +916,14 @@ unsigned int smbCalcSize(struct smb_hdr *ptr) { return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + - 2 /* size of the bcc field */ + BCC(ptr)); + 2 /* size of the bcc field */ + get_bcc(ptr)); } unsigned int smbCalcSize_LE(struct smb_hdr *ptr) { return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + - 2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr))); + 2 /* size of the bcc field */ + get_bcc_le(ptr)); } /* The following are taken from fs/ntfs/util.c */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ec5b68e3b92..7f25cc3d225 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, return NULL; } - if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase) - d_set_d_op(dentry, &cifs_ci_dentry_ops); - else - d_set_d_op(dentry, &cifs_dentry_ops); - alias = d_materialise_unique(dentry, inode); if (alias != NULL) { dput(dentry); @@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 7b01d3f6eed..1adc9625a34 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, } static void -decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, +decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses, const struct nls_table *nls_cp) { int len; @@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, return; } -static int decode_ascii_ssetup(char **pbcc_area, int bleft, +static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses, const struct nls_table *nls_cp) { @@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, return 0; } -#ifdef CONFIG_CIFS_EXPERIMENTAL /* BB Move to ntlmssp.c eventually */ /* We do not malloc the blob, it is passed in pbuffer, because @@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; __u32 flags; + memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE)); memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmNegotiate; /* BB is NTLMV2 session security format easier to use here? 
*/ flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM; + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; @@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_EXTENDED_SEC; } - sec_blob->NegotiateFlags |= cpu_to_le32(flags); + sec_blob->NegotiateFlags = cpu_to_le32(flags); sec_blob->WorkstationName.BufferOffset = 0; sec_blob->WorkstationName.Length = 0; @@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM; + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) flags |= NTLMSSP_NEGOTIATE_SIGN; @@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); - sec_blob->NegotiateFlags |= cpu_to_le32(flags); + sec_blob->NegotiateFlags = cpu_to_le32(flags); sec_blob->LmChallengeResponse.BufferOffset = cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); @@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; - if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && - !calc_seckey(ses)) { + if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) || + (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) + && !calc_seckey(ses)) { memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); @@ -563,17 +564,6 @@ setup_ntlmv2_ret: return rc; } - -static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB, - struct cifsSesInfo *ses) -{ - build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses); - pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); - - return; -} -#endif - int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const struct nls_table *nls_cp) @@ -585,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, char *str_area; SESSION_SETUP_ANDX *pSMB; __u32 capabilities; - int count; + __u16 count; int resp_buf_type; struct kvec iov[3]; enum securityEnum type; - __u16 action; - int bytes_remaining; + __u16 action, bytes_remaining; struct key *spnego_key = NULL; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ u16 blob_len; @@ -814,71 +803,70 @@ ssetup_ntlmssp_authenticate: rc = -ENOSYS; goto ssetup_exit; #endif /* CONFIG_CIFS_UPCALL */ - } else { -#ifdef CONFIG_CIFS_EXPERIMENTAL - if (type == RawNTLMSSP) { - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cERROR(1, "NTLMSSP requires Unicode support"); - rc = -ENOSYS; + } else if (type == RawNTLMSSP) { + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cERROR(1, "NTLMSSP requires Unicode support"); + rc = -ENOSYS; + goto ssetup_exit; + } + + cFYI(1, "ntlmssp session setup phase %d", phase); + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + switch(phase) { + case NtLmNegotiate: + build_ntlmssp_negotiate_blob( + pSMB->req.SecurityBlob, ses); + iov[1].iov_len 
= sizeof(NEGOTIATE_MESSAGE); + iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = + cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + break; + case NtLmAuthenticate: + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av pairs, + * domain, user, workstation names, flags, etc. + */ + ntlmsspblob = kzalloc( + 5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + cERROR(1, "Can't allocate NTLMSSP blob"); + rc = -ENOMEM; goto ssetup_exit; } - cFYI(1, "ntlmssp session setup phase %d", phase); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities |= cpu_to_le32(capabilities); - if (phase == NtLmNegotiate) { - setup_ntlmssp_neg_req(pSMB, ses); - iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = &pSMB->req.SecurityBlob[0]; - } else if (phase == NtLmAuthenticate) { - /* 5 is an empirical value, large enought to - * hold authenticate message, max 10 of - * av paris, doamin,user,workstation mames, - * flags etc.. - */ - ntlmsspblob = kmalloc( - 5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - cERROR(1, "Can't allocate NTLMSSP"); - rc = -ENOMEM; - goto ssetup_exit; - } - - rc = build_ntlmssp_auth_blob(ntlmsspblob, - &blob_len, ses, nls_cp); - if (rc) - goto ssetup_exit; - iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(blob_len); - /* Make sure that we tell the server that we - are using the uid that it just gave us back - on the response (challenge) */ - smb_buf->Uid = ses->Suid; - } else { - cERROR(1, "invalid phase %d", phase); - rc = -ENOSYS; + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, nls_cp); + if (rc) goto ssetup_exit; - } - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_oslm_strings(&bcc_ptr, nls_cp); - } else { - cERROR(1, "secType %d not supported!", type); + iov[1].iov_len = blob_len; + iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + break; + default: + cERROR(1, "invalid phase %d", phase); rc = -ENOSYS; goto ssetup_exit; } -#else + /* unicode strings must be word aligned */ + if ((iov[0].iov_len + iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, nls_cp); + } else { cERROR(1, "secType %d not supported!", type); rc = -ENOSYS; goto ssetup_exit; -#endif } iov[2].iov_base = str_area; @@ -887,10 +875,10 @@ ssetup_ntlmssp_authenticate: count = iov[1].iov_len + iov[2].iov_len; smb_buf->smb_buf_length += count; - BCC_LE(smb_buf) = cpu_to_le16(count); + put_bcc_le(count, smb_buf); rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, - CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); + CIFS_LOG_ERROR); /* SMB request buf freed in SendReceive2 */ pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; @@ -921,7 +909,7 @@ ssetup_ntlmssp_authenticate: cFYI(1, "UID = %d ", ses->Suid); /* response can have either 3 or 4 word count - Samba sends 3 */ /* and lanman response is 3 */ - bytes_remaining = BCC(smb_buf); + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf); if (smb_buf->WordCount == 4) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e0588cdf4cc..c1ccca1a933 100644 ---
a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -36,7 +36,13 @@ extern mempool_t *cifs_mid_poolp; -static struct mid_q_entry * +static void +wake_up_task(struct mid_q_entry *mid) +{ + wake_up_process(mid->callback_data); +} + +struct mid_q_entry * AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) { struct mid_q_entry *temp; @@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ /* when mid allocated can be before when sent */ temp->when_alloc = jiffies; - temp->tsk = current; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = wake_up_task; + temp->callback_data = current; } - spin_lock(&GlobalMid_Lock); - list_add_tail(&temp->qhead, &server->pending_mid_q); atomic_inc(&midCount); temp->midState = MID_REQUEST_ALLOCATED; - spin_unlock(&GlobalMid_Lock); return temp; } -static void +void DeleteMidQEntry(struct mid_q_entry *midEntry) { #ifdef CONFIG_CIFS_STATS2 unsigned long now; #endif - spin_lock(&GlobalMid_Lock); midEntry->midState = MID_FREE; - list_del(&midEntry->qhead); atomic_dec(&midCount); - spin_unlock(&GlobalMid_Lock); if (midEntry->largeBuf) cifs_buf_release(midEntry->resp_buf); else @@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) mempool_free(midEntry, cifs_mid_poolp); } +static void +delete_mid(struct mid_q_entry *mid) +{ + spin_lock(&GlobalMid_Lock); + list_del(&mid->qhead); + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); +} + static int smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) { @@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) if (ssocket == NULL) return -ENOTSOCK; /* BB eventually add reconnect code here */ - smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; + smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; @@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, return smb_sendv(server, &iov, 1); } -static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) +static int wait_for_free_request(struct TCP_Server_Info *server, + const int long_op) { if (long_op == CIFS_ASYNC_OP) { /* oplock breaks must not be held up */ - atomic_inc(&ses->server->inFlight); + atomic_inc(&server->inFlight); return 0; } spin_lock(&GlobalMid_Lock); while (1) { - if (atomic_read(&ses->server->inFlight) >= - cifs_max_pending){ + if (atomic_read(&server->inFlight) >= cifs_max_pending) { spin_unlock(&GlobalMid_Lock); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->num_waiters); + atomic_inc(&server->num_waiters); #endif - wait_event(ses->server->request_q, - atomic_read(&ses->server->inFlight) + wait_event(server->request_q, + atomic_read(&server->inFlight) < cifs_max_pending); #ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->num_waiters); + atomic_dec(&server->num_waiters); #endif spin_lock(&GlobalMid_Lock); } else { - if (ses->server->tcpStatus == CifsExiting) { + if (server->tcpStatus == CifsExiting) { spin_unlock(&GlobalMid_Lock); return -ENOENT; } @@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) /* update # of requests on the wire to server */ if (long_op != CIFS_BLOCKING_OP) - atomic_inc(&ses->server->inFlight); + 
atomic_inc(&server->inFlight); spin_unlock(&GlobalMid_Lock); break; } @@ -308,53 +324,81 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf, *ppmidQ = AllocMidQEntry(in_buf, ses->server); if (*ppmidQ == NULL) return -ENOMEM; + spin_lock(&GlobalMid_Lock); + list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); return 0; } -static int wait_for_response(struct cifsSesInfo *ses, - struct mid_q_entry *midQ, - unsigned long timeout, - unsigned long time_to_wait) +static int +wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { - unsigned long curr_timeout; + int error; - for (;;) { - curr_timeout = timeout + jiffies; - wait_event_timeout(ses->server->response_q, - midQ->midState != MID_REQUEST_SUBMITTED, timeout); + error = wait_event_killable(server->response_q, + midQ->midState != MID_REQUEST_SUBMITTED); + if (error < 0) + return -ERESTARTSYS; - if (time_after(jiffies, curr_timeout) && - (midQ->midState == MID_REQUEST_SUBMITTED) && - ((ses->server->tcpStatus == CifsGood) || - (ses->server->tcpStatus == CifsNew))) { + return 0; +} - unsigned long lrt; - /* We timed out. Is the server still - sending replies ? */ - spin_lock(&GlobalMid_Lock); - lrt = ses->server->lstrp; - spin_unlock(&GlobalMid_Lock); +/* + * Send a SMB request and set the callback function in the mid to handle + * the result. Caller is responsible for dealing with timeouts. + */ +int +cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf, + mid_callback_t *callback, void *cbdata) +{ + int rc; + struct mid_q_entry *mid; - /* Calculate time_to_wait past last receive time. - Although we prefer not to time out if the - server is still responding - we will time - out if the server takes more than 15 (or 45 - or 180) seconds to respond to this request - and has not responded to any request from - other threads on the client within 10 seconds */ - lrt += time_to_wait; - if (time_after(jiffies, lrt)) { - /* No replies for time_to_wait. 
*/ - cERROR(1, "server not responding"); - return -1; - } - } else { - return 0; - } + rc = wait_for_free_request(server, CIFS_ASYNC_OP); + if (rc) + return rc; + + mutex_lock(&server->srv_mutex); + mid = AllocMidQEntry(in_buf, server); + if (mid == NULL) { + mutex_unlock(&server->srv_mutex); + return -ENOMEM; } -} + /* put it on the pending_mid_q */ + spin_lock(&GlobalMid_Lock); + list_add_tail(&mid->qhead, &server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + + rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + goto out_err; + } + + mid->callback = callback; + mid->callback_data = cbdata; + mid->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&server->inSend); +#endif + rc = smb_send(server, in_buf, in_buf->smb_buf_length); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&server->inSend); + mid->when_sent = jiffies; +#endif + mutex_unlock(&server->srv_mutex); + if (rc) + goto out_err; + + return rc; +out_err: + delete_mid(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); + return rc; +} /* * @@ -382,6 +426,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses, return rc; } +static int +sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) +{ + int rc = 0; + + cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command, + mid->mid, mid->midState); + + spin_lock(&GlobalMid_Lock); + /* ensure that it's no longer on the pending_mid_q */ + list_del_init(&mid->qhead); + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + spin_unlock(&GlobalMid_Lock); + return rc; + case MID_REQUEST_SUBMITTED: + /* socket is going down, reject all calls */ + if (server->tcpStatus == CifsExiting) { + cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d", + __func__, mid->mid, mid->command, mid->midState); + rc = -EHOSTDOWN; + break; + } + case MID_RETRY_NEEDED: + rc = -EAGAIN; + break; + default: + cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, + mid->mid, mid->midState); + rc = -EIO; + } + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); + return rc; +} + +/* + * An NT cancel request header looks just like the original request except: + * + * The Command is SMB_COM_NT_CANCEL + * The WordCount is zeroed out + * The ByteCount is zeroed out + * + * This function mangles an existing request buffer into a + * SMB_COM_NT_CANCEL request and then sends it. 
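The asynchronous interface introduced above (cifs_call_async) leaves completion handling to a per-mid callback invoked from the demultiplex thread, with timeout policy explicitly left to the caller. As a rough sketch of how a caller might use it — every name below (my_ctx, my_callback, my_send_and_wait) is invented for illustration, and mid cleanup is elided — a synchronous wrapper could look like this, assuming the cifs structures declared in cifsglob.h:

#include <linux/completion.h>
#include <linux/errno.h>

/* Hypothetical context passed to the callback through cbdata. */
struct my_ctx {
	struct completion done;
	int rc;
};

/* Invoked once the mid leaves MID_REQUEST_SUBMITTED; runs in the
 * demultiplex thread, so it must not sleep for long. */
static void my_callback(struct mid_q_entry *mid)
{
	struct my_ctx *ctx = mid->callback_data;

	ctx->rc = (mid->midState == MID_RESPONSE_RECEIVED) ? 0 : -EIO;
	complete(&ctx->done);
	/* a real callback would also release the mid when done with it */
}

static int my_send_and_wait(struct TCP_Server_Info *server,
			    struct smb_hdr *buf)
{
	struct my_ctx ctx = { .rc = 0 };
	int rc;

	init_completion(&ctx.done);
	rc = cifs_call_async(server, buf, my_callback, &ctx);
	if (rc)
		return rc;
	wait_for_completion(&ctx.done);	/* caller owns any timeout policy */
	return ctx.rc;
}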
+ */ +static int +send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf, + struct mid_q_entry *mid) +{ + int rc = 0; + + /* -4 for RFC1001 length and +2 for BCC field */ + in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2; + in_buf->Command = SMB_COM_NT_CANCEL; + in_buf->WordCount = 0; + put_bcc_le(0, in_buf); + + mutex_lock(&server->srv_mutex); + rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + return rc; + } + rc = smb_send(server, in_buf, in_buf->smb_buf_length); + mutex_unlock(&server->srv_mutex); + + cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", + in_buf->Mid, rc); + + return rc; +} + int SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, struct kvec *iov, int n_vec, int *pRespBufType /* ret */, @@ -390,7 +509,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, int rc = 0; int long_op; unsigned int receive_len; - unsigned long timeout; struct mid_q_entry *midQ; struct smb_hdr *in_buf = iov[0].iov_base; @@ -413,7 +531,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, to the same server. We may make this configurable later or use ses->maxReq */ - rc = wait_for_free_request(ses, long_op); + rc = wait_for_free_request(ses->server, long_op); if (rc) { cifs_small_buf_release(in_buf); return rc; @@ -457,65 +575,20 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, if (rc < 0) goto out; - if (long_op == CIFS_STD_OP) - timeout = 15 * HZ; - else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */ - timeout = 180 * HZ; - else if (long_op == CIFS_LONG_OP) - timeout = 45 * HZ; /* should be greater than - servers oplock break timeout (about 43 seconds) */ - else if (long_op == CIFS_ASYNC_OP) + if (long_op == CIFS_ASYNC_OP) goto out; - else if (long_op == CIFS_BLOCKING_OP) - timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */ - else { - cERROR(1, "unknown timeout flag %d", long_op); - rc = -EIO; - goto out; - } - - /* wait for 15 seconds or until woken up due to response arriving or - due to last connection to this server being unmounted */ - if (signal_pending(current)) { - /* if signal pending do not hold up user for full smb timeout - but we still give response a chance to complete */ - timeout = 2 * HZ; - } - - /* No user interrupts in wait - wreaks havoc with performance */ - wait_for_response(ses, midQ, timeout, 10 * HZ); - - spin_lock(&GlobalMid_Lock); - if (midQ->resp_buf == NULL) { - cERROR(1, "No response to cmd %d mid %d", - midQ->command, midQ->mid); - if (midQ->midState == MID_REQUEST_SUBMITTED) { - if (ses->server->tcpStatus == CifsExiting) - rc = -EHOSTDOWN; - else { - ses->server->tcpStatus = CifsNeedReconnect; - midQ->midState = MID_RETRY_NEEDED; - } - } + rc = wait_for_response(ses->server, midQ); + if (rc != 0) + goto out; - if (rc != -EHOSTDOWN) { - if (midQ->midState == MID_RETRY_NEEDED) { - rc = -EAGAIN; - cFYI(1, "marking request for retry"); - } else { - rc = -EIO; - } - } - spin_unlock(&GlobalMid_Lock); - DeleteMidQEntry(midQ); - /* Update # of requests on wire to server */ + rc = sync_mid_result(midQ, ses->server); + if (rc != 0) { atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); return rc; } - spin_unlock(&GlobalMid_Lock); receive_len = midQ->resp_buf->smb_buf_length; if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { @@ -559,19 +632,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, if (receive_len >= sizeof(struct smb_hdr) - 4 /* do not count RFC1001 header */ + (2 * 
midQ->resp_buf->WordCount) + 2 /* bcc */ ) - BCC(midQ->resp_buf) = - le16_to_cpu(BCC_LE(midQ->resp_buf)); + put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf); if ((flags & CIFS_NO_RESP) == 0) midQ->resp_buf = NULL; /* mark it so buf will not be freed by - DeleteMidQEntry */ + delete_mid */ } else { rc = -EIO; cFYI(1, "Bad MID state?"); } out: - DeleteMidQEntry(midQ); + delete_mid(midQ); atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); @@ -585,7 +657,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, { int rc = 0; unsigned int receive_len; - unsigned long timeout; struct mid_q_entry *midQ; if (ses == NULL) { @@ -610,7 +681,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, return -EIO; } - rc = wait_for_free_request(ses, long_op); + rc = wait_for_free_request(ses->server, long_op); if (rc) return rc; @@ -649,64 +720,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, if (rc < 0) goto out; - if (long_op == CIFS_STD_OP) - timeout = 15 * HZ; - /* wait for 15 seconds or until woken up due to response arriving or - due to last connection to this server being unmounted */ - else if (long_op == CIFS_ASYNC_OP) + if (long_op == CIFS_ASYNC_OP) goto out; - else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */ - timeout = 180 * HZ; - else if (long_op == CIFS_LONG_OP) - timeout = 45 * HZ; /* should be greater than - servers oplock break timeout (about 43 seconds) */ - else if (long_op == CIFS_BLOCKING_OP) - timeout = 0x7FFFFFFF; /* large but no so large as to wrap */ - else { - cERROR(1, "unknown timeout flag %d", long_op); - rc = -EIO; - goto out; - } - if (signal_pending(current)) { - /* if signal pending do not hold up user for full smb timeout - but we still give response a chance to complete */ - timeout = 2 * HZ; - } - - /* No user interrupts in wait - wreaks havoc with performance */ - wait_for_response(ses, midQ, timeout, 10 * HZ); - - spin_lock(&GlobalMid_Lock); - if (midQ->resp_buf == NULL) { - cERROR(1, "No response for cmd %d mid %d", - midQ->command, midQ->mid); - if (midQ->midState == MID_REQUEST_SUBMITTED) { - if (ses->server->tcpStatus == CifsExiting) - rc = -EHOSTDOWN; - else { - ses->server->tcpStatus = CifsNeedReconnect; - midQ->midState = MID_RETRY_NEEDED; - } - } + rc = wait_for_response(ses->server, midQ); + if (rc != 0) + goto out; - if (rc != -EHOSTDOWN) { - if (midQ->midState == MID_RETRY_NEEDED) { - rc = -EAGAIN; - cFYI(1, "marking request for retry"); - } else { - rc = -EIO; - } - } - spin_unlock(&GlobalMid_Lock); - DeleteMidQEntry(midQ); - /* Update # of requests on wire to server */ + rc = sync_mid_result(midQ, ses->server); + if (rc != 0) { atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); return rc; } - spin_unlock(&GlobalMid_Lock); receive_len = midQ->resp_buf->smb_buf_length; if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { @@ -748,43 +775,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, if (receive_len >= sizeof(struct smb_hdr) - 4 /* do not count RFC1001 header */ + (2 * out_buf->WordCount) + 2 /* bcc */ ) - BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); + put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf); } else { rc = -EIO; cERROR(1, "Bad MID state?"); } out: - DeleteMidQEntry(midQ); + delete_mid(midQ); atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); return rc; } -/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. 
*/ - -static int -send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf, - struct mid_q_entry *midQ) -{ - int rc = 0; - struct cifsSesInfo *ses = tcon->ses; - __u16 mid = in_buf->Mid; - - header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0); - in_buf->Mid = mid; - mutex_lock(&ses->server->srv_mutex); - rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); - if (rc) { - mutex_unlock(&ses->server->srv_mutex); - return rc; - } - rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); - mutex_unlock(&ses->server->srv_mutex); - return rc; -} - /* We send a LOCKINGX_CANCEL_LOCK to cause the Windows blocking lock to return. */ @@ -807,7 +811,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon, pSMB->hdr.Mid = GetNextMid(ses->server); return SendReceive(xid, ses, in_buf, out_buf, - &bytes_returned, CIFS_STD_OP); + &bytes_returned, 0); } int @@ -845,7 +849,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, return -EIO; } - rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP); if (rc) return rc; @@ -863,7 +867,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); if (rc) { - DeleteMidQEntry(midQ); + delete_mid(midQ); mutex_unlock(&ses->server->srv_mutex); return rc; } @@ -880,7 +884,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { - DeleteMidQEntry(midQ); + delete_mid(midQ); return rc; } @@ -899,10 +903,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, if (in_buf->Command == SMB_COM_TRANSACTION2) { /* POSIX lock. We send a NT_CANCEL SMB to cause the blocking lock to return. */ - - rc = send_nt_cancel(tcon, in_buf, midQ); + rc = send_nt_cancel(ses->server, in_buf, midQ); if (rc) { - DeleteMidQEntry(midQ); + delete_mid(midQ); return rc; } } else { @@ -914,47 +917,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, /* If we get -ENOLCK back the lock may have already been removed. Don't exit in this case. */ if (rc && rc != -ENOLCK) { - DeleteMidQEntry(midQ); + delete_mid(midQ); return rc; } } - /* Wait 5 seconds for the response. */ - if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) { + if (wait_for_response(ses->server, midQ) == 0) { /* We got the response - restart system call. */ rstart = 1; } } - spin_lock(&GlobalMid_Lock); - if (midQ->resp_buf) { - spin_unlock(&GlobalMid_Lock); - receive_len = midQ->resp_buf->smb_buf_length; - } else { - cERROR(1, "No response for cmd %d mid %d", - midQ->command, midQ->mid); - if (midQ->midState == MID_REQUEST_SUBMITTED) { - if (ses->server->tcpStatus == CifsExiting) - rc = -EHOSTDOWN; - else { - ses->server->tcpStatus = CifsNeedReconnect; - midQ->midState = MID_RETRY_NEEDED; - } - } - - if (rc != -EHOSTDOWN) { - if (midQ->midState == MID_RETRY_NEEDED) { - rc = -EAGAIN; - cFYI(1, "marking request for retry"); - } else { - rc = -EIO; - } - } - spin_unlock(&GlobalMid_Lock); - DeleteMidQEntry(midQ); + rc = sync_mid_result(midQ, ses->server); + if (rc != 0) return rc; - } + receive_len = midQ->resp_buf->smb_buf_length; if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { cERROR(1, "Frame too large received. 
Length: %d Xid: %d", receive_len, xid); @@ -998,10 +976,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, if (receive_len >= sizeof(struct smb_hdr) - 4 /* do not count RFC1001 header */ + (2 * out_buf->WordCount) + 2 /* bcc */ ) - BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); + put_bcc(get_bcc_le(out_buf), out_buf); out: - DeleteMidQEntry(midQ); + delete_mid(midQ); if (rstart && rc == -EACCES) return -ERESTARTSYS; return rc; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 5525e1c660f..69015787618 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -20,10 +20,9 @@ #include <linux/spinlock.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" static atomic_t permission_epoch = ATOMIC_INIT(0); diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 602240569c8..6475877b076 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -7,9 +7,8 @@ #include <linux/time.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) { diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h new file mode 100644 index 00000000000..c910b5eb1ce --- /dev/null +++ b/fs/coda/coda_cache.h @@ -0,0 +1,22 @@ +/* Coda filesystem -- Linux Minicache + * + * Copyright (C) 1989 - 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. Contact Peter Braam + * <coda@cs.cmu.edu> + */ + +#ifndef _CFSNC_HEADER_ +#define _CFSNC_HEADER_ + +/* credential cache */ +void coda_cache_enter(struct inode *inode, int mask); +void coda_cache_clear_inode(struct inode *); +void coda_cache_clear_all(struct super_block *sb); +int coda_cache_check(struct inode *inode, int mask); + +/* for downcalls and attributes and lookups */ +void coda_flag_inode_children(struct inode *inode, int flag); + +#endif /* _CFSNC_HEADER_ */ diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h new file mode 100644 index 00000000000..e35071b1de0 --- /dev/null +++ b/fs/coda/coda_fs_i.h @@ -0,0 +1,58 @@ +/* + * coda_fs_i.h + * + * Copyright (C) 1998 Carnegie Mellon University + * + */ + +#ifndef _LINUX_CODA_FS_I +#define _LINUX_CODA_FS_I + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/coda.h> + +/* + * coda fs inode data + * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and + * c_cached_perm. + * vfs_inode is set only when the inode is created and never changes. + * c_fid is set when the inode is created and should be considered immutable. 
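For context, the embedded vfs_inode field is what makes the standard container_of() recovery idiom work: given the struct inode the VFS hands around, the filesystem can get back to its private coda_inode_info. The ITOC() helper added in coda_linux.h below expresses this via list_entry(), which is simply container_of() by another name; a minimal equivalent (illustrative only, to_coda_inode is an invented name) would be:

#include <linux/fs.h>
#include <linux/kernel.h>

static inline struct coda_inode_info *to_coda_inode(struct inode *inode)
{
	/* valid because vfs_inode is embedded inside coda_inode_info */
	return container_of(inode, struct coda_inode_info, vfs_inode);
}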
+ */ +struct coda_inode_info { + struct CodaFid c_fid; /* Coda identifier */ + u_short c_flags; /* flags (see below) */ + unsigned int c_mapcount; /* nr of times this inode is mapped */ + unsigned int c_cached_epoch; /* epoch for cached permissions */ + vuid_t c_uid; /* fsuid for cached permissions */ + unsigned int c_cached_perm; /* cached access permissions */ + spinlock_t c_lock; + struct inode vfs_inode; +}; + +/* + * coda fs file private data + */ +#define CODA_MAGIC 0xC0DAC0DA +struct coda_file_info { + int cfi_magic; /* magic number */ + struct file *cfi_container; /* container file for this cnode */ + unsigned int cfi_mapcount; /* nr of times this file is mapped */ +}; + +#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) + +/* flags */ +#define C_VATTR 0x1 /* Validity of vattr in inode */ +#define C_FLUSH 0x2 /* used after a flush */ +#define C_DYING 0x4 /* from venus (which died) */ +#define C_PURGE 0x8 + +int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); +int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); +void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); + +#endif diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index bf4a3fd3c8e..2bdbcc11b37 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -17,9 +17,8 @@ #include <linux/string.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> +#include "coda_linux.h" /* initialize the debugging variables */ int coda_fake_statfs; diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h new file mode 100644 index 00000000000..9b0c5323890 --- /dev/null +++ b/fs/coda/coda_linux.h @@ -0,0 +1,101 @@ +/* + * Coda File System, Linux Kernel module + * + * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University + * Linux modifications (C) 1996, Peter J. Braam + * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. 
*/ + +#ifndef _LINUX_CODA_FS +#define _LINUX_CODA_FS + +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/types.h> +#include <linux/fs.h> +#include "coda_fs_i.h" + +/* operations */ +extern const struct inode_operations coda_dir_inode_operations; +extern const struct inode_operations coda_file_inode_operations; +extern const struct inode_operations coda_ioctl_inode_operations; + +extern const struct dentry_operations coda_dentry_operations; + +extern const struct address_space_operations coda_file_aops; +extern const struct address_space_operations coda_symlink_aops; + +extern const struct file_operations coda_dir_operations; +extern const struct file_operations coda_file_operations; +extern const struct file_operations coda_ioctl_operations; + +/* operations shared over more than one file */ +int coda_open(struct inode *i, struct file *f); +int coda_release(struct inode *i, struct file *f); +int coda_permission(struct inode *inode, int mask, unsigned int flags); +int coda_revalidate_inode(struct dentry *); +int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_setattr(struct dentry *, struct iattr *); + +/* this file: helpers */ +char *coda_f2s(struct CodaFid *f); +int coda_isroot(struct inode *i); +int coda_iscontrol(const char *name, size_t length); + +void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); +void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); +unsigned short coda_flags_to_cflags(unsigned short); + +/* sysctl.h */ +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#define CODA_ALLOC(ptr, cast, size) do { \ + if (size < PAGE_SIZE) \ + ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + else \ + ptr = (cast)vmalloc((unsigned long) size); \ + if (!ptr) \ + printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ + else memset( ptr, 0, size ); \ +} while (0) + + +#define CODA_FREE(ptr,size) \ + do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) + +/* inode to cnode access functions */ + +static inline struct coda_inode_info *ITOC(struct inode *inode) +{ + return list_entry(inode, struct coda_inode_info, vfs_inode); +} + +static __inline__ struct CodaFid *coda_i2f(struct inode *inode) +{ + return &(ITOC(inode)->c_fid); +} + +static __inline__ char *coda_i2s(struct inode *inode) +{ + return coda_f2s(&(ITOC(inode)->c_fid)); +} + +/* this will not zap the inode away */ +static __inline__ void coda_flag_inode(struct inode *inode, int flag) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_flags |= flag; + spin_unlock(&cii->c_lock); +} + +#endif diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 29badd91360..2b8dae4d121 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -23,10 +23,9 @@ #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" @@ -61,7 +60,7 @@ static int coda_return_EIO(void) } #define CODA_EIO_ERROR ((void *) (coda_return_EIO)) -static const struct dentry_operations coda_dentry_operations = +const struct dentry_operations coda_dentry_operations = { .d_revalidate = coda_dentry_revalidate, .d_delete = coda_dentry_delete, @@ -126,8 +125,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc return ERR_PTR(error);
exit: - d_set_d_op(entry, &coda_dentry_operations); - if (inode && (type & CODA_NOCACHE)) coda_flag_inode(inode, C_VATTR | C_PURGE); diff --git a/fs/coda/file.c b/fs/coda/file.c index c8b50ba4366..0433057be33 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -21,10 +21,9 @@ #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" #include "coda_int.h" static ssize_t diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 50dc7d189f5..871b2771546 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -28,10 +28,9 @@ #include <linux/vmalloc.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" @@ -45,7 +44,7 @@ static struct kmem_cache * coda_inode_cachep; static struct inode *coda_alloc_inode(struct super_block *sb) { struct coda_inode_info *ei; - ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->c_fid, 0, sizeof(struct CodaFid)); @@ -193,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; + sb->s_d_op = &coda_dentry_operations; sb->s_bdi = &vc->bdi; /* get root fid from Venus: this needs the root inode */ diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 741f0bd0391..6cbb3afb36d 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -19,10 +19,10 @@ #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" + /* pioctl ops */ static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); static long coda_pioctl(struct file *filp, unsigned int cmd, diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 62647a8595e..8f616e0e252 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -43,10 +43,10 @@ #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" + #include "coda_int.h" /* statistics */ diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index af78f007a2b..ab94ef63cae 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -16,9 +16,9 @@ #include <linux/pagemap.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> + +#include "coda_linux.h" static int coda_symlink_filler(struct file *file, struct page *page) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index c3563cab975..9727e0c5257 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -33,10 +33,9 @@ #include <linux/vfs.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" diff --git a/fs/compat.c b/fs/compat.c index eb1740ac8c0..f6fd0a00e6c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * } /* - * The following statfs calls are copies of code from fs/open.c and + * The following statfs calls are copies of code from fs/statfs.c and * should be checked 
against those from time to time */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) @@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat __put_user(kbuf->f_namelen, &ubuf->f_namelen) || __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize)) + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } @@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type, if (nr_segs > fast_segs) { ret = -ENOMEM; iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) { - *ret_pointer = fast_pointer; + if (iov == NULL) goto out; - } } *ret_pointer = iov; diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig index 13587cc97a0..9febcdefdfd 100644 --- a/fs/configfs/Kconfig +++ b/fs/configfs/Kconfig @@ -1,8 +1,8 @@ config CONFIGFS_FS tristate "Userspace-driven configuration filesystem" - depends on SYSFS + select SYSFS help - configfs is a ram-based filesystem that provides the converse + configfs is a RAM-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based view of kernel objects, configfs is a filesystem-based manager of kernel objects, or config_items. diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 026cf68553a..82bda8fdfc1 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations; extern const struct file_operations bin_fops; extern const struct inode_operations configfs_dir_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; +extern const struct dentry_operations configfs_dentry_ops; extern int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 36637a8c1ed..90ff3cb10de 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -72,7 +72,7 @@ static int configfs_d_delete(const struct dentry *dentry) return 1; } -static const struct dentry_operations configfs_dentry_ops = { +const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, /* simple_delete_dentry() isn't exported */ .d_delete = configfs_d_delete, @@ -442,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den return error; } - d_set_d_op(dentry, &configfs_dentry_ops); d_rehash(dentry); return 0; @@ -489,7 +488,6 @@ static struct dentry * configfs_lookup(struct inode *dir, */ if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - d_set_d_op(dentry, &configfs_dentry_ops); d_add(dentry, NULL); return NULL; } @@ -683,7 +681,6 @@ static int create_default_group(struct config_group *parent_group, ret = -ENOMEM; child = d_alloc(parent, &name); if (child) { - d_set_d_op(child, &configfs_dentry_ops); d_add(child, NULL); ret = configfs_attach_group(&parent_group->cg_item, @@ -1681,7 +1678,6 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) err = -ENOMEM; dentry = d_alloc(configfs_sb->s_root, &name); if (dentry) { - d_set_d_op(dentry, &configfs_dentry_ops); d_add(dentry, NULL); err = configfs_attach_group(sd->s_element, &group->cg_item, diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 
7d3607febe1..ecc62178bed 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -101,6 +101,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) configfs_root_group.cg_item.ci_dentry = root; root->d_fsdata = &configfs_root; sb->s_root = root; + sb->s_d_op = &configfs_dentry_ops; /* the rest get that */ return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 32fd5fe9ca0..e141939080f 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); -/* These two macros may change in future, to provide better st_ino - semantics. */ -#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1) +/* These macros may change in future, to provide better st_ino semantics. */ #define OFFSET(x) ((x)->i_ino) -static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode) +static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset) { + if (!cino->offset) + return offset + 1; + if (!cino->size) + return offset + 1; + + /* + * The file mode test fixes buggy mkcramfs implementations where + * cramfs_inode->offset is set to a non-zero value for entries + * which did not contain data, like device nodes and fifos. + */ + switch (cino->mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + return cino->offset << 2; + default: + break; + } + return offset + 1; +} + +static struct inode *get_cramfs_inode(struct super_block *sb, + struct cramfs_inode *cramfs_inode, unsigned int offset) +{ + struct inode *inode; static struct timespec zerotime; + + inode = iget_locked(sb, cramino(cramfs_inode, offset)); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + switch (cramfs_inode->mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + break; + case S_IFDIR: + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + break; + default: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); + } + inode->i_mode = cramfs_inode->mode; inode->i_uid = cramfs_inode->uid; - inode->i_size = cramfs_inode->size; - inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; inode->i_gid = cramfs_inode->gid; + + /* if the lower 2 bits are zero, the inode contains data */ + if (!(inode->i_ino & 3)) { + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + } + /* Struct copy intentional */ inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even without -noleaf option.
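The inode numbering in cramino() above is worth spelling out: entries that actually carry data (regular files, directories, symlinks) use their 4-byte-aligned data offset shifted left by two, so their st_ino always has the low two bits clear, which is what the (i_ino & 3) test in get_cramfs_inode() keys on; everything else gets its unique position in the directory plus one. A worked example with hypothetical values (not taken from the patch):

/*
 * regular file, stored offset field 0x40, size 100
 *	-> cramino() returns 0x40 << 2 = 0x100	(low two bits clear)
 * empty fifo found at directory offset 0x2c, size 0
 *	-> cramino() returns 0x2c + 1 = 0x2d	(treated as data-less)
 */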
*/ - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &generic_ro_fops; - inode->i_data.a_ops = &cramfs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &cramfs_dir_inode_operations; - inode->i_fop = &cramfs_directory_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_data.a_ops = &cramfs_aops; - } else { - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } -} -static struct inode *get_cramfs_inode(struct super_block *sb, - struct cramfs_inode * cramfs_inode) -{ - struct inode *inode; - if (CRAMINO(cramfs_inode) == 1) { - inode = new_inode(sb); - if (inode) { - inode->i_ino = 1; - setup_inode(inode, cramfs_inode); - } - } else { - inode = iget_locked(sb, CRAMINO(cramfs_inode)); - if (inode && (inode->i_state & I_NEW)) { - setup_inode(inode, cramfs_inode); - unlock_new_inode(inode); - } - } + unlock_new_inode(inode); + return inode; } @@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "cramfs: root is not a directory\n"); goto out; } + /* correct strange, hard-coded permissions of mkcramfs */ + super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + root_offset = super.root.offset << 2; if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { sbi->size=super.size; @@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* Set it all up.. */ sb->s_op = &cramfs_ops; - root = get_cramfs_inode(sb, &super.root); + root = get_cramfs_inode(sb, &super.root, 0); if (!root) goto out; sb->s_root = d_alloc_root(root); @@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) */ namelen = de->namelen << 2; memcpy(buf, name, namelen); - ino = CRAMINO(de); + ino = cramino(de, OFFSET(inode) + offset); mode = de->mode; mutex_unlock(&read_mutex); nextoffset = offset + sizeof(*de) + namelen; @@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s struct cramfs_inode *de; char *name; int namelen, retval; + int dir_off = OFFSET(dir) + offset; - de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); + de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* Try to take advantage of sorted directories */ @@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s if (!retval) { struct cramfs_inode entry = *de; mutex_unlock(&read_mutex); - d_add(dentry, get_cramfs_inode(dir->i_sb, &entry)); + d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off)); return NULL; } /* else (retval < 0) */ diff --git a/fs/dcache.c b/fs/dcache.c index 5699d4c027c..2a6bd9a4ae9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry) /** * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * @dentry: the target dentry * After this call, in-progress rcu-walk path lookup will fail. This * should be called after unhashing, and after changing d_inode (if * the dentry has not already been unhashed). @@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry) /** * d_kill - kill dentry and return parent * @dentry: dentry to kill + * @parent: parent dentry * * The dentry must already be unhashed and removed from the LRU. 
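The dcache hunks just below are the pivot of this series: d_alloc(), d_alloc_pseudo(), d_alloc_root() and d_obtain_alias() now seed each new dentry's operations from the super_block's s_d_op, which is why the per-dentry d_set_d_op() calls could be dropped from cifs, coda and configfs earlier in this diff. A filesystem opting in only has to assign the field once at mount time; a hedged sketch (the examplefs names are invented, and the root-inode setup is elided):

#include <linux/fs.h>
#include <linux/dcache.h>

extern const struct super_operations examplefs_super_ops;
extern const struct dentry_operations examplefs_dentry_ops;

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_op = &examplefs_super_ops;
	/* every dentry allocated under this sb now inherits these ops */
	sb->s_d_op = &examplefs_dentry_ops;
	/* ... allocate the root inode and sb->s_root here ... */
	return 0;
}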
* @@ -1320,6 +1322,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) __dget_dlock(parent); dentry->d_parent = parent; dentry->d_sb = parent->d_sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); list_add(&dentry->d_u.d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); } @@ -1335,6 +1338,7 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) struct dentry *dentry = d_alloc(NULL, name); if (dentry) { dentry->d_sb = sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); dentry->d_parent = dentry; dentry->d_flags |= DCACHE_DISCONNECTED; } @@ -1355,8 +1359,8 @@ EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { - BUG_ON(dentry->d_op); - BUG_ON(dentry->d_flags & (DCACHE_OP_HASH | + WARN_ON_ONCE(dentry->d_op); + WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE )); @@ -1378,8 +1382,11 @@ EXPORT_SYMBOL(d_set_d_op); static void __d_instantiate(struct dentry *dentry, struct inode *inode) { spin_lock(&dentry->d_lock); - if (inode) + if (inode) { + if (unlikely(IS_AUTOMOUNT(inode))) + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; list_add(&dentry->d_alias, &inode->i_dentry); + } dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); @@ -1507,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode) res = d_alloc(NULL, &name); if (res) { res->d_sb = root_inode->i_sb; + d_set_d_op(res, res->d_sb->s_d_op); res->d_parent = res; d_instantiate(res, root_inode); } @@ -1567,6 +1575,7 @@ struct dentry *d_obtain_alias(struct inode *inode) /* attach a disconnected dentry */ spin_lock(&tmp->d_lock); tmp->d_sb = inode->i_sb; + d_set_d_op(tmp, tmp->d_sb->s_d_op); tmp->d_inode = inode; tmp->d_flags |= DCACHE_DISCONNECTED; list_add(&tmp->d_alias, &inode->i_dentry); @@ -2449,8 +2458,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) } /** - * Prepend path string to a buffer - * + * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry (may be modified by this function) * @buffer: pointer to the end of the buffer diff --git a/fs/direct-io.c b/fs/direct-io.c index 85882f6ba5f..b044705eedd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error) } EXPORT_SYMBOL_GPL(dio_end_io); -static int +static void dio_bio_alloc(struct dio *dio, struct block_device *bdev, sector_t first_sector, int nr_vecs) { struct bio *bio; + /* + * bio_alloc() is guaranteed to return a bio when called with + * __GFP_WAIT and we request a valid number of vectors. 
+ */ bio = bio_alloc(GFP_KERNEL, nr_vecs); bio->bi_bdev = bdev; @@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, dio->bio = bio; dio->logical_offset_in_bio = dio->cur_page_fs_offset; - return 0; } /* @@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector) goto out; sector = start_sector << (dio->blkbits - 9); nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); + dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); dio->boundary = 0; out: return ret; diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 2dbb422e811..1897eb1b4b6 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -1,8 +1,7 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" depends on EXPERIMENTAL && INET - depends on SYSFS && (IPV6 || IPV6=n) - select CONFIGFS_FS + depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP help A general purpose distributed lock manager for kernel or userspace diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 37a34c2c622..9c64ae9e4c1 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,6 +63,9 @@ #define NEEDED_RMEM (4*1024*1024) #define CONN_HASH_SIZE 32 +/* Number of messages to send before rescheduling */ +#define MAX_SEND_MSG_COUNT 25 + struct cbuf { unsigned int base; unsigned int len; @@ -108,6 +111,7 @@ struct connection { #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 +#define CF_APP_LIMITED 7 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ @@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); - if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + if (!con) + return; + + clear_bit(SOCK_NOSPACE, &con->sock->flags); + + if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { + con->sock->sk->sk_write_pending--; + clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + } + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } @@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; + int one = 1; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con) make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); log_print("connecting to %d", con->nodeid); + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); @@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con, goto create_out; } + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one)); @@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con) const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset; + int count = 0; mutex_lock(&con->sock_mutex); if (con->sock == NULL) @@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con) ret = kernel_sendpage(con->sock, e->page, offset, len, 
msg_flags); if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } cond_resched(); goto out; } if (ret <= 0) goto send_error; } - /* Don't starve people filling buffers */ + + /* Don't starve people filling buffers */ + if (++count >= MAX_SEND_MSG_COUNT) { cond_resched(); + count = 0; + } spin_lock(&con->writequeue_lock); e->offset += ret; @@ -1430,20 +1468,19 @@ static void work_stop(void) static int work_start(void) { - int error; - recv_workqueue = create_workqueue("dlm_recv"); - error = IS_ERR(recv_workqueue); - if (error) { - log_print("can't start dlm_recv %d", error); - return error; + recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZEABLE, 0); + if (!recv_workqueue) { + log_print("can't start dlm_recv"); + return -ENOMEM; } - send_workqueue = create_singlethread_workqueue("dlm_send"); - error = IS_ERR(send_workqueue); - if (error) { - log_print("can't start dlm_send %d", error); + send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZEABLE, 0); + if (!send_workqueue) { + log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); - return error; + return -ENOMEM; } return 0; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index cbadc1bee6e..bfd8b680e64 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, BUG_ON(!crypt_stat || !crypt_stat->tfm || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", + ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); ecryptfs_dump_hex(crypt_stat->key, crypt_stat->key_size); @@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to " - "derive IV for extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); goto out; } if (unlikely(ecryptfs_verbosity > 0)) { @@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, } rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; " + "rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " "encryption:\n"); ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); @@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page, rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to " - "derive IV for extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); goto out; } if 
(unlikely(ecryptfs_verbosity > 0)) { @@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page, } rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; " + "rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " "decryption:\n"); ecryptfs_dump_hex((char *)(page_address(page) @@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) } ecryptfs_printk(KERN_DEBUG, "Initializing cipher [%s]; strlen = [%d]; " - "key_size_bits = [%d]\n", + "key_size_bits = [%zd]\n", crypt_stat->cipher, (int)strlen(crypt_stat->cipher), crypt_stat->key_size << 3); if (crypt_stat->tfm) { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 413a3c48f0b..dbc84ed9633 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key) (((struct user_key_payload*)key->payload.data)->data); } -#define ECRYPTFS_SUPER_MAGIC 0xf15f #define ECRYPTFS_MAX_KEYSET_SIZE 1024 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 @@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); +__attribute__ ((format(printf, 1, 2))) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 91da02987bf..81e10e6a944 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - int rc; + ssize_t rc; struct dentry *lower_dentry; struct vfsmount *lower_vfsmount; struct file *file = iocb->ki_filp; @@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - if (!ecryptfs_inode_to_private(inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out_free; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free; } - if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) - && !(file->f_flags & O_RDONLY)) { + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) + == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) { rc = -EPERM; printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " "file must hence be opened RO\n", __func__); @@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } } mutex_unlock(&crypt_stat->cs_mutex); - ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] " - "size: [0x%.16x]\n", inode, inode->i_ino, - i_size_read(inode)); + ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " + "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + (unsigned 
long long)i_size_read(inode)); goto out; out_free: kmem_cache_free(ecryptfs_file_info_cache, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 337352a9475..bd33f87a190 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } - if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; } rc = ecryptfs_write_metadata(ecryptfs_dentry); if (rc) { @@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, rc = -ENOMEM; goto out; } - if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out_free_kmem; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free_kmem; } crypt_stat = &ecryptfs_inode_to_private( ecryptfs_dentry->d_inode)->crypt_stat; @@ -441,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, struct qstr lower_name; int rc = 0; - d_set_d_op(ecryptfs_dentry, &ecryptfs_dops); if ((ecryptfs_dentry->d_name.len == 1 && !strcmp(ecryptfs_dentry->d_name.name, ".")) || (ecryptfs_dentry->d_name.len == 2 diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index b1f6858a522..c1436cff6f2 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -59,7 +59,7 @@ static int process_request_key_err(long err_code) break; default: ecryptfs_printk(KERN_WARNING, "Unknown error code: " - "[0x%.16x]\n", err_code); + "[0x%.16lx]\n", err_code); rc = -EINVAL; } return rc; @@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size, } else { rc = -EINVAL; ecryptfs_printk(KERN_WARNING, - "Unsupported packet size: [%d]\n", size); + "Unsupported packet size: [%zd]\n", size); } return rc; } @@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, auth_tok->session_key.decrypted_key_size); crypt_stat->flags |= ECRYPTFS_KEY_VALID; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n", + ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n", crypt_stat->key_size); ecryptfs_dump_hex(crypt_stat->key, crypt_stat->key_size); @@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { ecryptfs_printk(KERN_ERR, "Expected " "signature of size [%d]; " - "read size [%d]\n", + "read size [%zd]\n", ECRYPTFS_SIG_SIZE, tag_11_contents_size); rc = -EIO; @@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, goto 
out_wipe_list; break; default: - ecryptfs_printk(KERN_DEBUG, "No packet at offset " - "[%d] of the file header; hex value of " + ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " + "of the file header; hex value of " "character is [0x%.2x]\n", i, src[i]); next_packet_is_auth_tok_packet = 0; } @@ -1864,8 +1864,8 @@ found_matching_auth_tok: "session key for authentication token with sig " "[%.*s]; rc = [%d]. Removing auth tok " "candidate from the list and searching for " - "the next match.\n", candidate_auth_tok_sig, - ECRYPTFS_SIG_SIZE_HEX, rc); + "the next match.\n", ECRYPTFS_SIG_SIZE_HEX, + candidate_auth_tok_sig, rc); list_for_each_entry_safe(auth_tok_list_item, auth_tok_list_item_tmp, &auth_tok_list, list) { @@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (encrypted_session_key_valid) { ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " "using auth_tok->session_key.encrypted_key, " - "where key_rec->enc_key_size = [%d]\n", + "where key_rec->enc_key_size = [%zd]\n", key_rec->enc_key_size); memcpy(key_rec->enc_key, auth_tok->session_key.encrypted_key, @@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (rc < 1 || rc > 2) { ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat session key; expected rc = 1; " - "got rc = [%d]. key_rec->enc_key_size = [%d]\n", + "got rc = [%d]. key_rec->enc_key_size = [%zd]\n", rc, key_rec->enc_key_size); rc = -ENOMEM; goto out; @@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat encrypted session key; " "expected rc = 1; got rc = [%d]. " - "key_rec->enc_key_size = [%d]\n", rc, + "key_rec->enc_key_size = [%zd]\n", rc, key_rec->enc_key_size); rc = -ENOMEM; goto out; @@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, goto out; } rc = 0; - ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n", + ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", crypt_stat->key_size); rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, (*key_rec).enc_key_size); @@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, } ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); if (ecryptfs_verbosity > 0) { - ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n", + ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n", key_rec->enc_key_size); ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 35103867537..758323a0f09 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -36,6 +36,7 @@ #include <linux/parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> +#include <linux/magic.h> #include "ecryptfs_kernel.h" /** @@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) return rc; } -/** - * ecryptfs_interpose - * @lower_dentry: Existing dentry in the lower filesystem - * @dentry: ecryptfs' dentry - * @sb: ecryptfs's super_block - * @flags: flags to govern behavior of interpose procedure - * - * Interposes upper and lower dentries. 
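
Editor's sketch — the keystore hunk just above corrects a "%.*s" conversion whose arguments were passed in the wrong order: the int precision is consumed before the string pointer, so ECRYPTFS_SIG_SIZE_HEX must precede candidate_auth_tok_sig. A minimal userspace illustration of the rule (the signature literal is invented):

#include <stdio.h>

int main(void)
{
        const char sig[] = "0123456789abcdef";

        /* "%.*s" takes the precision first, then the string. */
        printf("sig [%.*s]\n", 8, sig);         /* prints: sig [01234567] */
        return 0;
}
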
- * - * Returns zero on success; non-zero otherwise - */ -int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, - struct super_block *sb, u32 flags) +static struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) { - struct inode *lower_inode; struct inode *inode; int rc = 0; - lower_inode = lower_dentry->d_inode; if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { rc = -EXDEV; goto out; @@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, if (special_file(lower_inode->i_mode)) init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); - d_set_d_op(dentry, &ecryptfs_dops); fsstack_copy_attr_all(inode, lower_inode); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); + return inode; +out: + return ERR_PTR(rc); +} + +/** + * ecryptfs_interpose + * @lower_dentry: Existing dentry in the lower filesystem + * @dentry: ecryptfs' dentry + * @sb: ecryptfs's super_block + * @flags: flags to govern behavior of interpose procedure + * + * Interposes upper and lower dentries. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, + struct super_block *sb, u32 flags) +{ + struct inode *lower_inode = lower_dentry->d_inode; + struct inode *inode = ecryptfs_get_inode(lower_inode, sb); + if (IS_ERR(inode)) + return PTR_ERR(inode); if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); else d_instantiate(dentry, inode); -out: - return rc; + return 0; } enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, @@ -492,59 +501,11 @@ struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; /** - * ecryptfs_read_super - * @sb: The ecryptfs super block - * @dev_name: The path to mount over - * - * Read the super block of the lower filesystem, and use - * ecryptfs_interpose to create our initial inode and super block - * struct. 
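
Editor's sketch — the refactoring above splits inode creation out of ecryptfs_interpose() into ecryptfs_get_inode(), which hands back either an inode or an ERR_PTR-encoded errno, so each caller needs only one IS_ERR() test. A userspace analogue of that convention (the helpers below imitate the kernel's ERR_PTR/IS_ERR/PTR_ERR; none of this is eCryptfs code):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Userspace stand-ins for ERR_PTR()/PTR_ERR()/IS_ERR(). */
static inline void *err_ptr(long error) { return (void *)error; }
static inline long ptr_err(const void *ptr) { return (long)ptr; }
static inline int is_err(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Shaped like ecryptfs_get_inode(): one return value carries either
 * the object or the errno, so callers branch exactly once. */
static void *get_inode_like(int lower_sb_matches)
{
        static int fake_inode;

        if (!lower_sb_matches)
                return err_ptr(-EXDEV);
        return &fake_inode;
}

int main(void)
{
        void *inode = get_inode_like(0);

        if (is_err(inode))
                printf("lookup failed: %ld\n", ptr_err(inode)); /* -18 */
        return 0;
}
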
- */ -static int ecryptfs_read_super(struct super_block *sb, const char *dev_name) -{ - struct path path; - int rc; - - rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); - if (rc) { - ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); - goto out; - } - if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { - rc = -EINVAL; - printk(KERN_ERR "Mount on filesystem of type " - "eCryptfs explicitly disallowed due to " - "known incompatibilities\n"); - goto out_free; - } - ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); - sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; - sb->s_blocksize = path.dentry->d_sb->s_blocksize; - ecryptfs_set_dentry_lower(sb->s_root, path.dentry); - ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt); - rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0); - if (rc) - goto out_free; - rc = 0; - goto out; -out_free: - path_put(&path); -out: - return rc; -} - -/** * ecryptfs_get_sb * @fs_type * @flags * @dev_name: The path to mount over * @raw_data: The options passed into the kernel - * - * The whole ecryptfs_get_sb process is broken into 3 functions: - * ecryptfs_parse_options(): handle options passed to ecryptfs, if any - * ecryptfs_read_super(): this accesses the lower filesystem and uses - * ecryptfs_interpose to perform most of the linking - * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) */ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) @@ -553,6 +514,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags struct ecryptfs_sb_info *sbi; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; + struct inode *inode; + struct path path; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); @@ -575,10 +538,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_flags = flags; rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); - if (rc) { - deactivate_locked_super(s); - goto out; - } + if (rc) + goto out1; ecryptfs_set_superblock_private(s, sbi); s->s_bdi = &sbi->bdi; @@ -586,34 +547,55 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; s->s_op = &ecryptfs_sops; + s->s_d_op = &ecryptfs_dops; - rc = -ENOMEM; - s->s_root = d_alloc(NULL, &(const struct qstr) { - .hash = 0,.name = "/",.len = 1}); + err = "Reading sb failed"; + rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); + if (rc) { + ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); + goto out1; + } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + s->s_blocksize = path.dentry->d_sb->s_blocksize; + s->s_magic = ECRYPTFS_SUPER_MAGIC; + + inode = ecryptfs_get_inode(path.dentry->d_inode, s); + rc = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_free; + + s->s_root = d_alloc_root(inode); if (!s->s_root) { - deactivate_locked_super(s); - goto out; + iput(inode); + rc = -ENOMEM; + goto out_free; } - d_set_d_op(s->s_root, &ecryptfs_dops); - s->s_root->d_sb = s; - s->s_root->d_parent = s->s_root; + rc = -ENOMEM; root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if 
(!root_info) { - deactivate_locked_super(s); - goto out; - } + if (!root_info) + goto out_free; + /* ->kill_sb() will take care of root_info */ ecryptfs_set_dentry_private(s->s_root, root_info); + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt); + s->s_flags |= MS_ACTIVE; - rc = ecryptfs_read_super(s, dev_name); - if (rc) { - deactivate_locked_super(s); - err = "Reading sb failed"; - goto out; - } return dget(s->s_root); +out_free: + path_put(&path); +out1: + deactivate_locked_super(s); out: if (sbi) { ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); @@ -828,9 +810,10 @@ static int __init ecryptfs_init(void) ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " "eCryptfs cannot run on this system. The " - "default eCryptfs extent size is [%d] bytes; " - "the page size is [%d] bytes.\n", - ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE); + "default eCryptfs extent size is [%u] bytes; " + "the page size is [%lu] bytes.\n", + ECRYPTFS_DEFAULT_EXTENT_SIZE, + (unsigned long)PAGE_CACHE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index b1d82756544..cc64fca89f8 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting " - "page (upper index [0x%.16x])\n", page->index); + "page (upper index [0x%.16lx])\n", page->index); ClearPageUptodate(page); goto out; } @@ -237,7 +237,7 @@ out: ClearPageUptodate(page); else SetPageUptodate(page); - ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", page->index); unlock_page(page); return rc; @@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file, return -ENOMEM; *pagep = page; + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; @@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file, SetPageUptodate(page); } } else { - rc = ecryptfs_decrypt_page(page); - if (rc) { - printk(KERN_ERR "%s: Error decrypting page " - "at index [%ld]; rc = [%d]\n", - __func__, page->index, rc); - ClearPageUptodate(page); - goto out; + if (prev_page_end_size + >= i_size_read(page->mapping->host)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + printk(KERN_ERR "%s: Error decrypting " + "page at index [%ld]; " + "rc = [%d]\n", + __func__, page->index, rc); + ClearPageUptodate(page); + goto out; + } } SetPageUptodate(page); } } - prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. 
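
Editor's sketch — the write_begin change above computes prev_page_end_size before it is used and zero-fills any page that starts at or beyond i_size instead of decrypting ciphertext that does not exist in the lower file. A small standalone version of that offset test (4 KiB pages assumed; this is not eCryptfs code):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages for the sketch */

/* Mirrors the patched test: a page whose first byte lies at or past
 * i_size has no lower-file ciphertext, so zeroing it suffices. */
static int page_needs_decrypt(unsigned long index, long long i_size)
{
        long long prev_page_end_size = (long long)index << PAGE_SHIFT;

        return prev_page_end_size < i_size;
}

int main(void)
{
        printf("%d\n", page_needs_decrypt(0, 100)); /* 1: overlaps data */
        printf("%d\n", page_needs_decrypt(1, 100)); /* 0: wholly past EOF */
        return 0;
}
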
*/ if (index != 0) { @@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file, } else ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" - "(page w/ index = [0x%.16x], to = [%d])\n", index, to); + "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, to); @@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file, rc = fill_zeros_to_end_of_page(page, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " - "zeros in page with index = [0x%.16x]\n", index); + "zeros in page with index = [0x%.16lx]\n", index); goto out; } rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " - "index [0x%.16x])\n", index); + "index [0x%.16lx])\n", index); goto out; } if (pos + copied > i_size_read(ecryptfs_inode)) { i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " - "[0x%.16x]\n", i_size_read(ecryptfs_inode)); + "[0x%.16llx]\n", + (unsigned long long)i_size_read(ecryptfs_inode)); } rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); if (rc) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8cf07242067..cc8a9b7d606 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -217,7 +217,7 @@ struct ep_send_events_data { * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ -static int max_user_watches __read_mostly; +static long max_user_watches __read_mostly; /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). @@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly; #include <linux/sysctl.h> -static int zero; +static long zero; +static long long_max = LONG_MAX; ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, - .maxlen = sizeof(int), + .maxlen = sizeof(max_user_watches), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, + .extra2 = &long_max, }, { } }; @@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); - atomic_dec(&ep->user->epoll_watches); + atomic_long_dec(&ep->user->epoll_watches); return 0; } @@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, { int error, revents, pwake = 0; unsigned long flags; + long user_watches; struct epitem *epi; struct ep_pqueue epq; - if (unlikely(atomic_read(&ep->user->epoll_watches) >= - max_user_watches)) + user_watches = atomic_long_read(&ep->user->epoll_watches); + if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; @@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, spin_unlock_irqrestore(&ep->lock, flags); - atomic_inc(&ep->user->epoll_watches); + atomic_long_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) @@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void) */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; + BUG_ON(max_user_watches < 0); /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); diff --git a/fs/ext2/dir.c 
b/fs/ext2/dir.c index 2709b34206a..47cda410b54 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -28,21 +28,30 @@ typedef struct ext2_dir_entry_2 ext2_dirent; +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ static inline unsigned ext2_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); +#if (PAGE_CACHE_SIZE >= 65536) if (len == EXT2_MAX_REC_LEN) return 1 << 16; +#endif return len; } static inline __le16 ext2_rec_len_to_disk(unsigned len) { +#if (PAGE_CACHE_SIZE >= 65536) if (len == (1 << 16)) return cpu_to_le16(EXT2_MAX_REC_LEN); else BUG_ON(len > (1 << 16)); +#endif return cpu_to_le16(len); } @@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet) p = (ext2_dirent *)(kaddr + offs); rec_len = ext2_rec_len_from_disk(p->rec_len); - if (rec_len < EXT2_DIR_REC_LEN(1)) + if (unlikely(rec_len < EXT2_DIR_REC_LEN(1))) goto Eshort; - if (rec_len & 3) + if (unlikely(rec_len & 3)) goto Ealign; - if (rec_len < EXT2_DIR_REC_LEN(p->name_len)) + if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len))) goto Enamelen; - if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))) goto Espan; - if (le32_to_cpu(p->inode) > max_inumber) + if (unlikely(le32_to_cpu(p->inode) > max_inumber)) goto Einumber; } if (offs != limit) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index f8aecd2e329..2e1d8341d82 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str inode = NULL; if (ino) { inode = ext2_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { ext2_error(dir->i_sb, __func__, "deleted inode referenced: %lu", diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e0c6380ff99..7731695e65d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); -void ext2_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext2_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; @@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function, } va_start(args, fmt); - printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (test_opt(sb, ERRORS_PANIC)) @@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function, void ext2_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT2-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index f84700be327..c2e4dce984d 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - /* Check the remaining name entries */ - while (!IS_LAST_ENTRY(entry)) { - struct ext2_xattr_entry *next = - EXT2_XATTR_NEXT(entry); - if ((char *)next >= end) - goto bad_block; - entry = next; - } if (ext2_xattr_cache_insert(bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; @@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb) /* * ext2_xattr_set() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index b3db2264942..045995c8ce5 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -20,6 +20,7 @@ #include <linux/ext3_jbd.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> /* * balloc.c contains the blocks allocation and deallocation routines @@ -39,6 +40,21 @@ #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + /** * ext3_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) return ext3_bg_num_gdb_meta(sb,group); } + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. 
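
Editor's sketch — ahead of the trim code below, this hunk adds ext3_get_group_no_and_offset(), which is plain arithmetic: subtract the first data block, then split the result into a quotient (the group) and a remainder (the offset within it). A standalone version with invented geometry (8192 blocks per group, first data block 1):

#include <stdio.h>

/* Userspace sketch of ext3_get_group_no_and_offset(). */
static void group_and_offset(unsigned long blocknr,
                             unsigned long first_data_block,
                             unsigned long blocks_per_group,
                             unsigned long *group, unsigned long *offset)
{
        blocknr -= first_data_block;
        *offset = blocknr % blocks_per_group;
        *group = blocknr / blocks_per_group;
}

int main(void)
{
        unsigned long group, offset;

        group_and_offset(20000, 1, 8192, &group, &offset);
        printf("group=%lu offset=%lu\n", group, offset); /* group=2 offset=3615 */
        return 0;
}
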
+ */
+ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
+                               ext3_grpblk_t start, ext3_grpblk_t max,
+                               ext3_grpblk_t minblocks)
+{
+       handle_t *handle;
+       ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
+       ext3_fsblk_t discard_block;
+       struct ext3_sb_info *sbi;
+       struct buffer_head *gdp_bh, *bitmap_bh = NULL;
+       struct ext3_group_desc *gdp;
+       int err = 0, ret = 0;
+
+       /*
+        * We will update one block bitmap, and one group descriptor
+        */
+       handle = ext3_journal_start_sb(sb, 2);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       bitmap_bh = read_block_bitmap(sb, group);
+       if (!bitmap_bh) {
+               err = -EIO;
+               goto err_out;
+       }
+
+       BUFFER_TRACE(bitmap_bh, "getting undo access");
+       err = ext3_journal_get_undo_access(handle, bitmap_bh);
+       if (err)
+               goto err_out;
+
+       gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+       if (!gdp) {
+               err = -EIO;
+               goto err_out;
+       }
+
+       BUFFER_TRACE(gdp_bh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, gdp_bh);
+       if (err)
+               goto err_out;
+
+       free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+       sbi = EXT3_SB(sb);
+
+       /* Walk through the whole group */
+       while (start < max) {
+               start = bitmap_search_next_usable_block(start, bitmap_bh, max);
+               if (start < 0)
+                       break;
+               next = start;
+
+               /*
+                * Allocate contiguous free extents by setting bits in the
+                * block bitmap
+                */
+               while (next < max
+                       && claim_block(sb_bgl_lock(sbi, group),
+                                       next, bitmap_bh)) {
+                       next++;
+               }
+
+               /* We did not claim any blocks */
+               if (next == start)
+                       continue;
+
+               discard_block = (ext3_fsblk_t)start +
+                               ext3_group_first_block_no(sb, group);
+
+               /* Update counters */
+               spin_lock(sb_bgl_lock(sbi, group));
+               le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
+               spin_unlock(sb_bgl_lock(sbi, group));
+               percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+
+               /* Do not issue a TRIM on extents smaller than minblocks */
+               if ((next - start) < minblocks)
+                       goto free_extent;
+
+               /* Send the TRIM command down to the device */
+               err = sb_issue_discard(sb, discard_block, next - start,
+                                      GFP_NOFS, 0);
+               count += (next - start);
+free_extent:
+               freed = 0;
+
+               /*
+                * Clear bits in the bitmap
+                */
+               for (bit = start; bit < next; bit++) {
+                       BUFFER_TRACE(bitmap_bh, "clear bit");
+                       if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
+                                               bit, bitmap_bh->b_data)) {
+                               ext3_error(sb, __func__,
+                                       "bit already cleared for block "E3FSBLK,
+                                       (unsigned long)bit);
+                               BUFFER_TRACE(bitmap_bh, "bit already cleared");
+                       } else {
+                               freed++;
+                       }
+               }
+
+               /* Update counters */
+               spin_lock(sb_bgl_lock(sbi, group));
+               le16_add_cpu(&gdp->bg_free_blocks_count, freed);
+               spin_unlock(sb_bgl_lock(sbi, group));
+               percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+
+               start = next;
+               if (err < 0) {
+                       if (err != -EOPNOTSUPP)
+                               ext3_warning(sb, __func__, "Discard command "
+                                            "returned error %d\n", err);
+                       break;
+               }
+
+               if (fatal_signal_pending(current)) {
+                       err = -ERESTARTSYS;
+                       break;
+               }
+
+               cond_resched();
+
+               /* No more suitable extents */
+               if ((free_blocks - count) < minblocks)
+                       break;
+       }
+
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
+       if (!err)
+               err = ret;
+
+       /* And the group descriptor block */
+       BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
+       ret = ext3_journal_dirty_metadata(handle, gdp_bh);
+       if (!err)
+               err = ret;
+
+       ext3_debug("trimmed %d blocks in the group %d\n",
+               count, group);
+
+err_out:
+       if (err)
+               count = err;
+       ext3_journal_stop(handle);
+       brelse(bitmap_bh);
+
+       return count;
+}
+
+/**
+ * ext3_trim_fs() -- trim ioctl handler
+ * @sb:                superblock for filesystem
+ * @start:     first byte to trim
+ * @len:       number of bytes to trim from start
+ * @minlen:    minimum extent length in bytes
+ *
+ * ext3_trim_fs goes through all allocation groups containing bytes from
+ * start to start+len. For each such group the ext3_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+       ext3_grpblk_t last_block, first_block, free_blocks;
+       unsigned long first_group, last_group;
+       unsigned long group, ngroups;
+       struct ext3_group_desc *gdp;
+       struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+       uint64_t start, len, minlen, trimmed;
+       ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
+       int ret = 0;
+
+       start = range->start >> sb->s_blocksize_bits;
+       len = range->len >> sb->s_blocksize_bits;
+       minlen = range->minlen >> sb->s_blocksize_bits;
+       trimmed = 0;
+
+       if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
+               return -EINVAL;
+       if (start >= max_blks)
+               goto out;
+       if (start < le32_to_cpu(es->s_first_data_block)) {
+               len -= le32_to_cpu(es->s_first_data_block) - start;
+               start = le32_to_cpu(es->s_first_data_block);
+       }
+       if (start + len > max_blks)
+               len = max_blks - start;
+
+       ngroups = EXT3_SB(sb)->s_groups_count;
+       smp_rmb();
+
+       /* Determine first and last group to examine based on start and len */
+       ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
+                                    &first_group, &first_block);
+       ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
+                                    &last_group, &last_block);
+       last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+       last_block = EXT3_BLOCKS_PER_GROUP(sb);
+
+       if (first_group > last_group)
+               return -EINVAL;
+
+       for (group = first_group; group <= last_group; group++) {
+               gdp = ext3_get_group_desc(sb, group, NULL);
+               if (!gdp)
+                       break;
+
+               free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+               if (free_blocks < minlen)
+                       continue;
+
+               if (len >= EXT3_BLOCKS_PER_GROUP(sb))
+                       len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
+               else
+                       last_block = first_block + len;
+
+               ret = ext3_trim_all_free(sb, group, first_block,
+                                        last_block, minlen);
+               if (ret < 0)
+                       break;
+
+               trimmed += ret;
+               first_block = 0;
+       }
+
+       if (ret >= 0)
+               ret = 0;
+
+out:
+       range->len = trimmed * sb->s_blocksize;
+
+       return ret;
+}
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf..34f0a072b93 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
 	const char * error_msg = NULL;
 	const int rlen = ext3_rec_len_from_disk(de->rec_len);
 
-	if (rlen < EXT3_DIR_REC_LEN(1))
+	if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
 		error_msg = "rec_len is smaller than minimal";
-	else if (rlen % 4 != 0)
+	else if (unlikely(rlen % 4 != 0))
 		error_msg = "rec_len % 4 != 0";
-	else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+	else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
 		error_msg = "rec_len is too small for name_len";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+	else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
 		error_msg = "directory entry across blocks";
-	else if (le32_to_cpu(de->inode) >
-			le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+	else if (unlikely(le32_to_cpu(de->inode) >
+			le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
 		error_msg = "inode out of bounds";
 
-	if (error_msg != NULL)
+	if
(unlikely(error_msg != NULL)) ext3_error (dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, (unsigned long) le32_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index a9580617edd..ae94f6d949f 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + if (ext3_journal_dirty_metadata(handle, bh)) + return; } ext3_mark_inode_dirty(handle, inode); truncate_restart_transaction(handle, inode); if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ext3_journal_get_write_access(handle, bh); + if (ext3_journal_get_write_access(handle, bh)) + return; } } diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 88974814783..fc080dd561f 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -276,7 +276,29 @@ group_add_out: mnt_drop_write(filp->f_path.mnt); return err; } + case FITRIM: { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext3_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } default: return -ENOTTY; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index bce9dce639b..b27ba71810e 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir, struct buffer_head * bh_use[NAMEI_RA_SIZE]; struct buffer_head * bh, *ret = NULL; unsigned long start, block, b; + const u8 *name = entry->name; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead @@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir, namelen = entry->len; if (namelen > EXT3_NAME_LEN) return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == 0)) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } if (is_dx(dir)) { bh = ext3_dx_find_entry(dir, entry, res_dir, &err); /* @@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, struct qstr *entry, struct ext3_dir_entry_2 **res_dir, int *err) { - struct super_block * sb; + struct super_block *sb = dir->i_sb; struct dx_hash_info hinfo; - u32 hash; struct dx_frame frames[2], *frame; - struct ext3_dir_entry_2 *de, *top; struct buffer_head *bh; unsigned long block; int retval; - int namelen = entry->len; - const u8 *name = entry->name; - sb = dir->i_sb; - /* NFS may look up ".." 
- look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) { - if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ - } - hash = hinfo.hash; + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) + return NULL; do { block = dx_get_block(frame->at); if (!(bh = ext3_bread (NULL,dir, block, 0, err))) goto errout; - de = (struct ext3_dir_entry_2 *) bh->b_data; - top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) { - int off = (block << EXT3_BLOCK_SIZE_BITS(sb)) - + ((char *) de - bh->b_data); - - if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) { - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto errout; - } - if (ext3_match(namelen, name, de)) { - *res_dir = de; - dx_release(frames); - return bh; - } + retval = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { + dx_release(frames); + return bh; } - brelse (bh); + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + /* Check to see if we should continue to search */ - retval = ext3_htree_next_block(dir, hash, frame, + retval = ext3_htree_next_block(dir, hinfo.hash, frame, frames, NULL); if (retval < 0) { ext3_warning(sb, __func__, @@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str return ERR_PTR(-EIO); } inode = ext3_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { ext3_error(dir->i_sb, __func__, "deleted inode referenced: %lu", @@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext3_journal_dirty_metadata(handle, frames[0].bh); + err = ext3_journal_dirty_metadata(handle, frames[0].bh); + if (err) + goto journal_error; } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle, if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) return -EIO; if (de == de_del) { + int err; + BUFFER_TRACE(bh, "get_write_access"); - ext3_journal_get_write_access(handle, bh); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + if (pde) pde->rec_len = ext3_rec_len_to_disk( ext3_rec_len_from_disk(pde->rec_len) + @@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { +journal_error: + ext3_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext3_rec_len_from_disk(de->rec_len); @@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) { handle_t *handle; struct inode * inode; - struct buffer_head * dir_block; + struct buffer_head * dir_block = NULL; struct ext3_dir_entry_2 * de; int err, retries = 0; @@ -1790,15 +1793,14 @@ retry: inode->i_fop = &ext3_dir_operations; inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - drop_nlink(inode); /* is this nlink == 0? 
*/ - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } + if (!dir_block) + goto out_clear_inode; + BUFFER_TRACE(dir_block, "get_write_access"); - ext3_journal_get_write_access(handle, dir_block); + err = ext3_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext3_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1814,11 +1816,16 @@ retry: ext3_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); + err = ext3_journal_dirty_metadata(handle, dir_block); + if (err) + goto out_clear_inode; + + err = ext3_mark_inode_dirty(handle, inode); + if (!err) + err = ext3_add_entry (handle, dentry, inode); + if (err) { +out_clear_inode: inode->i_nlink = 0; unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); @@ -1827,10 +1834,14 @@ retry: } inc_nlink(dir); ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); + err = ext3_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + d_instantiate(dentry, inode); unlock_new_inode(inode); out_stop: + brelse(dir_block); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; } else { BUFFER_TRACE(new_bh, "get write access"); - ext3_journal_get_write_access(handle, new_bh); + retval = ext3_journal_get_write_access(handle, new_bh); + if (retval) + goto journal_error; new_de->inode = cpu_to_le32(old_inode->i_ino); if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) @@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, new_bh); + retval = ext3_journal_dirty_metadata(handle, new_bh); + if (retval) + goto journal_error; brelse(new_bh); new_bh = NULL; } @@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, ext3_update_dx_flag(old_dir); if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); + retval = ext3_journal_get_write_access(handle, dir_bh); + if (retval) + goto journal_error; PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_bh); + retval = ext3_journal_dirty_metadata(handle, dir_bh); + if (retval) { +journal_error: + ext3_std_error(new_dir->i_sb, retval); + goto end_rename; + } drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index e746d30b123..108b142e11e 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext3_journal_dirty_metadata(handle, gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -269,7 +273,11 
@@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(gdb); goto exit_bh; } - ext3_journal_dirty_metadata(handle, gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(it); goto exit_bh; } - ext3_journal_dirty_metadata(handle, it); + err = ext3_journal_dirty_metadata(handle, it); + if (err) { + brelse(it); + goto exit_bh; + } brelse(it); ext3_set_bit(bit, bh->b_data); } @@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto exit_bh; brelse(bh); /* Mark unused entries in inode bitmap used */ @@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); exit_bh: brelse(bh); @@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; - ext3_journal_dirty_metadata(handle, dind); + err = ext3_journal_dirty_metadata(handle, dind); + if (err) + goto exit_group_desc; brelse(dind); + dind = NULL; inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; - ext3_mark_iloc_dirty(handle, inode, &iloc); + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto exit_group_desc; memset((*primary)->b_data, 0, sb->s_blocksize); - ext3_journal_dirty_metadata(handle, *primary); + err = ext3_journal_dirty_metadata(handle, *primary); + if (err) + goto exit_group_desc; o_group_desc = EXT3_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto exit_inode; return 0; +exit_group_desc: + kfree(n_group_desc); exit_inode: //ext3_journal_release_buffer(handle, iloc.bh); brelse(iloc.bh); @@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb, } ext3_debug("update metadata backup %#04lx\n", (unsigned long)bh->b_blocknr); - if ((err = ext3_journal_get_write_access(handle, bh))) + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); brelse(bh); + if (err) + break; } if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext3_journal_dirty_metadata(handle, primary); + err = ext3_journal_dirty_metadata(handle, primary); + if (err) + goto exit_journal; /* Update the reserved block counts only once the new group is * active. 
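
Editor's sketch — a recurring shape in the resize hunks above and below: every ext3_journal_dirty_metadata() return value is now captured, and where several such calls run back to back the code keeps the first error rather than overwriting it. A minimal userspace version of that accumulation idiom (the stand-in returns -5 in place of -EIO):

#include <stdio.h>

/* Stand-in for ext3_journal_dirty_metadata(). */
static int dirty_metadata(int fail)
{
        return fail ? -5 : 0;
}

int main(void)
{
        int err, ret;

        err = dirty_metadata(1);        /* first failure: -5 */
        ret = dirty_metadata(0);        /* later call succeeds */
        if (!err)                       /* keep the first error, as the hunks do */
                err = ret;
        printf("err=%d\n", err);        /* err=-5 */
        return 0;
}
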
*/ @@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT3_INODES_PER_GROUP(sb)); - ext3_journal_dirty_metadata(handle, sbi->s_sbh); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: mutex_unlock(&sbi->s_resize_lock); @@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + if (err) { + ext3_warning(sb, __func__, + "error %d on journal dirty metadata", err); + ext3_journal_stop(handle); + goto exit_put; + } ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 77ce1616f72..85c8cc8f247 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -143,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, void ext3_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT3-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); } @@ -195,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb) sb->s_id); } -void ext3_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); ext3_handle_error(sb); @@ -274,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function, * case we take the easy way out and panic immediately. */ -void ext3_abort (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_abort(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (test_opt(sb, ERRORS_PANIC)) @@ -300,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function, journal_abort(EXT3_SB(sb)->s_journal, -EIO); } -void ext3_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_warning(struct super_block *sb, const char *function, + const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", - sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } @@ -346,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -363,8 +381,7 @@ fail: */ static int ext3_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext3_blkdev_remove(struct ext3_sb_info *sbi) @@ -737,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1848,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - if (generic_check_addressable(sb->s_blocksize_bits, - le32_to_cpu(es->s_blocks_count))) { + err = generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count)); + if (err) { ext3_msg(sb, KERN_ERR, "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) ext3_msg(sb, KERN_ERR, "error: CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -2142,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext3_msg(sb, KERN_ERR, - "error: failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -2297,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb, EXT3_SB(sb)->s_journal = journal; ext3_clear_journal_err(sb, es); - if (journal_devnum && + if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); @@ -2865,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->mnt->mnt_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT3_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext3_msg(sb, KERN_WARNING, "warning: Quota file not on filesystem root. 
" "Journaled quota will not work."); @@ -2895,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(path.dentry->d_inode)) { + if (ext3_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... @@ -2903,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, journal_lock_updates(EXT3_SB(sb)->s_journal); err = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } /* Read data from quotafile - avoid pagecache and such because we cannot afford diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index e69dc6dfaa8..32e6cc23bd9 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext3_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 14c3af26c67..adf96b82278 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ - if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { + if (!(*errp) && + ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ece76fb6a40..164c56092e5 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext4_filetype_table[filetype]); } - +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... 
+ */ int __ext4_check_dir_entry(const char *function, unsigned int line, - struct inode *dir, + struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, unsigned int offset) @@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (rlen < EXT4_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else + return 0; - if (error_msg != NULL) - ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - " - "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), offset, - le32_to_cpu(de->inode), - rlen, de->name_len); - return error_msg == NULL ? 1 : 0; + if (filp) + ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + + return 1; } static int ext4_readdir(struct file *filp, @@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - EXT4_ERROR_INODE(inode, "directory " - "contains a hole at offset %Lu", + EXT4_ERROR_FILE(filp, 0, + "directory contains a " + "hole at offset %llu", (unsigned long long) filp->f_pos); dir_has_error = 1; } @@ -194,8 +210,8 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry(inode, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, filp, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 94ce3d7a1c4..0c8d97b56f3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -62,8 +62,8 @@ #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) -#define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) +#define EXT4_ERROR_FILE(file, block, fmt, a...) 
\ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -561,23 +561,7 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif - -/* - * Mount options - */ -struct ext4_mount_options { - unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; - unsigned long s_commit_interval; - u32 s_min_batch_time, s_max_batch_time; -#ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; -#endif -}; - -/* Max physical block we can addres w/o extents */ +/* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* @@ -709,6 +693,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ ext4_decode_extra_time(&(inode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ @@ -719,6 +705,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ } while (0) #define i_disk_version osd1.linux1.l_i_version @@ -750,12 +738,13 @@ do { \ /* * storage for cached extent + * If ec_len == 0, then the cache is invalid. + * If ec_start == 0, then the cache represents a gap (null mapping) */ struct ext4_ext_cache { ext4_fsblk_t ec_start; ext4_lblk_t ec_block; __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; }; /* @@ -774,10 +763,12 @@ struct ext4_inode_info { * near to their parent directory's inode. */ ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ +#endif unsigned long i_flags; - ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file @@ -820,7 +811,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; - struct jbd2_inode jinode; + struct jbd2_inode *jinode; struct ext4_ext_cache i_cached_extent; /* @@ -840,14 +831,12 @@ struct ext4_inode_info { unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; - sector_t i_da_metadata_calc_last_lblock; + ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; /* on-disk additional length */ __u16 i_extra_isize; - spinlock_t i_block_reservation_lock; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -856,9 +845,11 @@ struct ext4_inode_info { /* completed IOs that might need unwritten extents handling */ struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + + spinlock_t i_block_reservation_lock; /* * Transactions that contain inode's metadata needed to complete @@ -917,11 +908,20 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ -#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt -#define set_opt(o, opt) o |= EXT4_MOUNT_##opt +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + 
~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + #define ext4_set_bit ext2_set_bit #define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_clear_bit ext2_clear_bit @@ -1087,6 +1087,7 @@ struct ext4_sb_info { struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ struct buffer_head **s_group_desc; unsigned int s_mount_opt; + unsigned int s_mount_opt2; unsigned int s_mount_flags; ext4_fsblk_t s_sb_block; uid_t s_resuid; @@ -1237,24 +1238,39 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ }; -#define EXT4_INODE_BIT_FNS(name, field) \ +#define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ - return test_bit(bit, &EXT4_I(inode)->i_##field); \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ - set_bit(bit, &EXT4_I(inode)->i_##field); \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ - clear_bit(bit, &EXT4_I(inode)->i_##field); \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } -EXT4_INODE_BIT_FNS(flag, flags) -EXT4_INODE_BIT_FNS(state, state_flags) +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1642,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, de, bh, offset) \ - __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1653,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ extern int ext4_sync_file(struct file *, int); +extern int ext4_flush_completed_IO(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -1752,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, const char *, ...) 
__attribute__ ((format (printf, 5, 6))); extern void ext4_error_file(struct file *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); + ext4_fsblk_t, const char *, ...) + __attribute__ ((format (printf, 5, 6))); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern void __ext4_abort(struct super_block *, const char *, unsigned int, @@ -2046,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 28ce70fd9cd..2e29abb30f7 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -119,10 +119,6 @@ struct ext4_ext_path { * structure for external API */ -#define EXT4_EXT_CACHE_NO 0 -#define EXT4_EXT_CACHE_GAP 1 -#define EXT4_EXT_CACHE_EXTENT 2 - /* * to be called by ext4_ext_walk_space() * negative retcode - error @@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode) static inline void ext4_ext_invalidate_cache(struct inode *inode) { - EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; + EXT4_I(inode)->i_cached_extent.ec_len = 0; } static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) @@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, } extern int ext4_ext_calc_metadata_amount(struct inode *inode, - sector_t lblocks); + ext4_lblk_t lblocks); extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b0bd792c58c..d8b992e658c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal) static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); return 0; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0554c48cb1f..63a75810b7c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_extent *ex; depth = path->p_depth; - /* try to predict block placement */ + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object's sections out-of-order but in a way + * that eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some heuristics or some way to allow + * userspace to pass a hint to the file system, + * especially if the latter case turns out to be + * common.
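A minimal sketch of the arithmetic the rewritten branch below performs, with simplified stand-in types: the goal keeps the requested logical block at the same distance from the nearest extent's physical start, and the new else-branch avoids the unsigned wrap the old one-liner could hit when the target block sits before the extent:

	typedef unsigned int lblk_t;		/* 32-bit logical block number */
	typedef unsigned long long fsblk_t;	/* physical (filesystem) block */

	static fsblk_t predict_goal(lblk_t block, lblk_t ext_block, fsblk_t ext_pblk)
	{
		/* Subtract in whichever direction keeps the 32-bit math in range. */
		if (block > ext_block)
			return ext_pblk + (block - ext_block);
		return ext_pblk - (ext_block - block);
	}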
+ */ ex = path[depth].p_ext; - if (ex) - return (ext4_ext_pblock(ex) + - (block - le32_to_cpu(ex->ee_block))); + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } /* it looks like index is empty; * try to find starting block from index itself */ @@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) * to allocate @blocks * Worse case is one block per extent */ -int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); int idxs, num = 0; @@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_block = start; cbex.ec_len = end - start; cbex.ec_start = 0; - cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext4_ext_pblock(ex); - cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } if (unlikely(cbex.ec_len == 0)) { @@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start, int type) + __u32 len, ext4_fsblk_t start) { struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_type = type; cex->ec_block = block; cex->ec_len = len; cex->ec_start = start; @@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); + ext4_ext_put_in_cache(inode, lblock, len, 0); } +/* + * Return 0 if cache is invalid; 1 if the cache is valid + */ static int ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; - int ret = EXT4_EXT_CACHE_NO; + int ret = 0; /* * We borrow i_block_reservation_lock to protect i_cached_extent @@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, cex = &EXT4_I(inode)->i_cached_extent; /* has cache valid data? */ - if (cex->ec_type == EXT4_EXT_CACHE_NO) + if (cex->ec_len == 0) goto errout; - BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && - cex->ec_type != EXT4_EXT_CACHE_EXTENT); if (in_range(block, cex->ec_block, cex->ec_len)) { ex->ee_block = cpu_to_le32(cex->ec_block); ext4_ext_store_pblock(ex, cex->ec_start); @@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); - ret = cex->ec_type; + ret = 1; } errout: spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2825,14 +2845,14 @@ fix_extent_len: * to an uninitialized extent. 
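With ec_type gone, the single-entry extent cache encodes its state in the remaining fields: ec_len == 0 means the entry is invalid, and ec_start == 0 marks a cached gap. A self-contained sketch of a lookup under that encoding (stand-in names; the i_block_reservation_lock protection used by the real code is omitted):

	struct ext_cache_stub {
		unsigned long long ec_start;	/* 0 => cached gap (hole) */
		unsigned int ec_block;		/* first logical block covered */
		unsigned int ec_len;		/* 0 => cache entry invalid */
	};

	/* Returns 1 and fills *pblk when @block is covered by the cache, else 0. */
	static int cache_lookup(const struct ext_cache_stub *cex,
				unsigned int block, unsigned long long *pblk)
	{
		if (cex->ec_len == 0)		/* invalidated entry */
			return 0;
		if (block < cex->ec_block || block - cex->ec_block >= cex->ec_len)
			return 0;		/* not in the cached range */
		/* A gap maps to no physical block; report 0 for it. */
		*pblk = cex->ec_start ? cex->ec_start + (block - cex->ec_block) : 0;
		return 1;
	}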
* * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple /intialized unintialized extents (up to three) + * extent into multiple /initialized uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Someone is writing in the middle of the extent * * One or more index blocks may be needed if the extent tree grows after - * the unintialized extent split. To prevent ENOSPC occur at the IO + * the uninitialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submits * the IO. The uninitialized extent called at this time will be split * into three uninitialized extents (at most). After IO completes, the part @@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev, * Handle EOFBLOCKS_FL flag, clearing it if necessary */ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, + ext4_lblk_t lblk, struct ext4_ext_path *path, unsigned int len) { @@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, * this turns out to be false, we can bail out from this * function immediately. */ - if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) + + if (lblk + len < le32_to_cpu(last_ex->ee_block) + ext4_ext_get_actual_len(last_ex)) return 0; /* @@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map, path, - map->m_len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); } else err = ret; goto out2; @@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, map, path); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map, path, map->m_len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); if (err < 0) goto out2; } @@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent_header *eh; struct ext4_extent newex, *ex; ext4_fsblk_t newblock; - int err = 0, depth, ret, cache_type; + int err = 0, depth, ret; unsigned int allocated = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; @@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_lblk, map->m_len, inode->i_ino); /* check in cache */ - cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); - if (cache_type) { - if (cache_type == EXT4_EXT_CACHE_GAP) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { + } else { /* block is already allocated */ newblock = map->m_lblk - le32_to_cpu(newex.ee_block) @@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, allocated = ext4_ext_get_actual_len(&newex) - (map->m_lblk -
le32_to_cpu(newex.ee_block)); goto out; - } else { - BUG(); } } @@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* Do not put uninitialized extent in the cache */ if (!ext4_ext_is_uninitialized(ex)) { ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start, - EXT4_EXT_CACHE_EXTENT); + ee_len, ee_start); goto out; } ret = ext4_ext_handle_uninitialized_extents(handle, @@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_flags |= EXT4_MAP_UNINIT; } - err = check_eofblocks_fl(handle, inode, map, path, ar.len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); if (err) goto out2; @@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * when it is _not_ an uninitialized extent. */ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); ext4_update_inode_fsync_trans(handle, inode, 1); } else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode) int err = 0; /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_completed_IO(inode); + + /* * probably first extent we're gonna free will be last in block */ err = ext4_writepage_trans_blocks(inode); @@ -3605,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). 
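Below, ext4_fallocate() picks up the new prototype: the preallocation hook now hangs off file_operations and receives a struct file, and unsupported mode bits are rejected up front. A rough sketch of the resulting shape, with stand-in types (the FALLOC_FL_KEEP_SIZE value matches <linux/falloc.h>; the error constant is a hypothetical stand-in for -EOPNOTSUPP):

	#define FALLOC_FL_KEEP_SIZE	0x01	/* from <linux/falloc.h> */
	#define EOPNOTSUPP_STUB		95	/* numeric value of EOPNOTSUPP */

	struct file_stub { void *private; };	/* stand-in for struct file */

	static long fallocate_stub(struct file_stub *file, int mode,
				   long long offset, long long len)
	{
		/* Refuse mode bits this filesystem does not understand. */
		if (mode & ~FALLOC_FL_KEEP_SIZE)
			return -EOPNOTSUPP_STUB;
		/* ... derive the inode from @file and preallocate [offset, len) ... */
		(void)file; (void)offset; (void)len;
		return 0;
	}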
*/ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3622,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* * currently supporting (pre)allocate mode for extent-based * files _only_ @@ -3629,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because @@ -3767,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, logical = (__u64)newex->ec_block << blksize_bits; - if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + if (newex->ec_start == 0) { pgoff_t offset; struct page *page; struct buffer_head *bh = NULL; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5a5c55ddcee..2e8322c8aa8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_mark_super_dirty(sb); } } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } return dquot_file_open(inode, filp); } @@ -188,6 +210,7 @@ const struct file_operations ext4_file_operations = { .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { @@ -201,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .check_acl = ext4_check_acl, - .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c1a7bc923cf..7829b287822 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode) * to written. * The function return the number of pending IOs on success. 
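The ext4_file_open() hunk above allocates the per-inode jbd2_inode lazily, on the first open for write. The idiom is worth calling out: allocate outside the lock (the allocation may sleep), publish under the lock only if nobody else won the race, and free the unused spare afterwards. A generic userspace sketch of the same pattern, with a pthread mutex standing in for i_lock:

	#include <stdlib.h>
	#include <pthread.h>

	struct shared {
		pthread_mutex_t lock;	/* stands in for inode->i_lock */
		void *ptr;		/* stands in for ei->jinode */
	};

	static int ensure_allocated(struct shared *s, size_t size)
	{
		void *spare = malloc(size);	/* may block; done unlocked */

		pthread_mutex_lock(&s->lock);
		if (!s->ptr) {
			if (!spare) {
				pthread_mutex_unlock(&s->lock);
				return -1;	/* -ENOMEM in the kernel code */
			}
			s->ptr = spare;		/* we won the race: publish it */
			spare = NULL;
		}
		pthread_mutex_unlock(&s->lock);
		free(spare);			/* lost the race: drop the spare */
		return 0;
	}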
*/ -static int flush_completed_IO(struct inode *inode) +extern int ext4_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; struct ext4_inode_info *ei = EXT4_I(inode); @@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; - ret = flush_completed_IO(inode); + ret = ext4_flush_completed_IO(inode); if (ret < 0) return ret; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1ce240a23eb..eb9097aec6f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1027,7 +1027,7 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e659597b690..9f7f9e49914 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,7 +39,9 @@ #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> +#include <linux/printk.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -54,10 +56,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); - return jbd2_journal_begin_ordered_truncate( - EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode, - new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there are no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); @@ -552,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, } /** - * ext4_blks_to_allocate: Look up the block map and count the number + * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks that need to be allocated for the given branch. * * @branch: chain of indirect blocks @@ -591,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, /** * ext4_alloc_blocks: allocate multiple blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocating at + * @goal: preferred physical block of allocation * @indirect_blks: the number of blocks we need to allocate for indirect * blocks - * + * @blks: number of desired blocks * @new_blocks: on return it will store the new block numbers for * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, ext4_fsblk_t goal, @@ -711,9 +726,11 @@ failed_out: /** * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction * @inode: owner * @indirect_blks: number of allocated indirect blocks * @blks: number of allocated direct blocks + * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next.
* @branch: place to store the chain in. * @@ -826,6 +843,7 @@ failed: /** * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction * @inode: owner * @block: (logical) number of block we are adding * @chain: chain of indirect blocks (with a missing link - see @@ -1081,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, * Calculate the number of metadata blocks we need to reserve * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); @@ -1320,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * avoid double accounting */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 1; + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -1350,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_da_update_reserve_space(inode, retval, 1); } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 0; + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -1878,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file, /* * Reserve a single block located at lblock */ -static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2239,7 +2257,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) * affects functions in many different parts of the allocation * call path. This flag exists primarily because we don't * want to change *many* call functions, so ext4_map_blocks() - * will set the magic i_delalloc_reserved_flag once the + * will set the EXT4_STATE_DELALLOC_RESERVED flag once the * inode's allocation semaphore is taken. * * If the blocks in question were delalloc blocks, set @@ -3362,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of - * the pages by calling redirty_page_for_writeback() but that + * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to @@ -3720,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) retry: io_end = ext4_init_io_end(inode, GFP_ATOMIC); if (!io_end) { - if (printk_ratelimit()) - printk(KERN_WARNING "%s: allocation fail\n", __func__); + pr_warn_ratelimited("%s: allocation fail\n", __func__); schedule(); goto retry; } @@ -3745,9 +3762,9 @@ retry: * preallocated extents, and those writes extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as unintialized + * For holes, we fallocate those blocks, mark them as uninitialized * If those blocks were preallocated, we make sure they are split, but - * still keep the range to write as unintialized + * still keep the range to write as uninitialized.
* * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still be pending when we return, we @@ -4045,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle, if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); } else { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -4169,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, { __le32 *p; int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; @@ -4184,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + err = ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; } - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext4_journal_get_write_access(handle, bh); @@ -4349,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); + brelse(bh); /* * Everything below this pointer has been @@ -4859,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not.
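The ext4_clear_state_flags() call above and the i_flags masking in the next hunk are two halves of the same change: on 64-bit builds the dynamic state bits now live in the upper 32 bits of the same unsigned long as the persistent flags, so only the low half may ever reach the on-disk inode. A compact sketch of the packing, with simplified names (it assumes a 64-bit unsigned long, like the #else branch of EXT4_INODE_BIT_FNS):

	#include <stdint.h>

	#define STATE_OFFSET 32		/* state bits sit above the flag bits */

	static inline void set_state(unsigned long *flags, int bit)
	{
		*flags |= 1UL << (bit + STATE_OFFSET);
	}

	static inline int test_state(unsigned long flags, int bit)
	{
		return (flags >> (bit + STATE_OFFSET)) & 1;
	}

	/* Only the persistent low half is allowed out to disk. */
	static inline uint32_t disk_flags(unsigned long flags)
	{
		return (uint32_t)(flags & 0xFFFFFFFF);
	}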
@@ -5118,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b4d4e3a4d5..851f49b2f9d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb) static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t block, int count) { - int ret; ext4_fsblk_t discard_block; discard_block = block + ext4_group_first_block_no(sb, block_group); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); - if (ret == -EOPNOTSUPP) { - ext4_warning(sb, "discard not supported, disabling"); - clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); - } - return ret; + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } /* @@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, ret, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, + if (test_opt(sb, DISCARD)) { + ret = ext4_issue_discard(sb, entry->group, entry->start_blk, entry->count); + if (unlikely(ret == -EOPNOTSUPP)) { + ext4_warning(sb, "discard not supported, " + "disabling"); + clear_opt(sb, DISCARD); + } + } err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -3881,19 +3881,6 @@ repeat: } } -/* - * finds all preallocated spaces and return blocks being freed to them - * if preallocated space becomes full (no block is used from the space) - * then the function frees space in buddy - * XXX: at the moment, truncate (which is the only way to free blocks) - * discards all preallocations - */ -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, - sector_t block, int count) -{ - BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); -} #ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { @@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * EDQUOT check, as blocks and quotas have been already * reserved when data being copied into pagecache. 
*/ - if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) ar->flags |= EXT4_MB_DELALLOC_RESERVED; else { /* Without delayed allocation we need to verify @@ -4380,7 +4367,8 @@ out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); if (!ar->len) { - if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); @@ -4626,7 +4614,11 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + if (!new_entry) { + err = -ENOMEM; + goto error_return; + } new_entry->start_blk = bit; new_entry->group = block_group; new_entry->count = count; @@ -4643,7 +4635,6 @@ do_more: ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); - ext4_mb_return_to_preallocation(inode, &e4b, block, count); } ret = ext4_free_blks_count(sb, gdp) + count; @@ -4718,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); - if (ret) - ext4_std_error(sb, ret); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); @@ -4819,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; uint64_t start, len, minlen, trimmed; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -4828,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; + if (start < first_data_blk) { + len -= first_data_blk - start; + start = first_data_blk; + } /* Determine first and last group to examine based on start and len */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, @@ -4851,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (len >= EXT4_BLOCKS_PER_GROUP(sb)) len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); else - last_block = len; + last_block = first_block + len; if (e4b.bd_info->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, &e4b, first_block, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 25f3a974b72..b0a126f23c2 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode) goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, 0, goal); + S_IFREG, NULL, goal); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dc40e75cba8..5485390d32c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry(dir, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) - +((char *)de - 
bh->b_data))) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. */ dir_file->f_pos = (dir_file->f_pos | (dir->i_sb->s_blocksize - 1)) + 1; @@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle, { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int i; + int i, err; i = 0; pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (!ext4_check_dir_entry(dir, de, bh, i)) + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); - ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) { handle_t *handle; struct inode *inode; - struct buffer_head *dir_block; + struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; @@ -1822,7 +1834,9 @@ retry: if (!dir_block) goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); - ext4_journal_get_write_access(handle, dir_block); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1839,10 +1853,12 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call 
ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, dir_block); - brelse(dir_block); - ext4_mark_inode_dirty(handle, inode); - err = ext4_add_entry(handle, dentry, inode); + err = ext4_handle_dirty_metadata(handle, dir, dir_block); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); @@ -1853,10 +1869,13 @@ out_clear_inode: } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; d_instantiate(dentry, inode); unlock_new_inode(inode); out_stop: + brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (!ext4_check_dir_entry(inode, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; @@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, new_dir, new_bh); + retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } brelse(new_bh); new_bh = NULL; } @@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + if (retval) { + ext4_std_error(old_dir->i_sb, retval); + goto end_rename; + } ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index beacce11ac5..7270dcfca92 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -44,7 +44,7 @@ int __init ext4_init_pageio(void) if (io_page_cachep == NULL) return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) { + if (io_end_cachep == NULL) { kmem_cache_destroy(io_page_cachep); return -ENOMEM; } @@ -158,11 +158,8 @@ static void ext4_end_io_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = NULL; - - io = kmem_cache_alloc(io_end_cachep, flags); + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); if (io) { - memset(io, 0, sizeof(*io)); atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 981c8477ada..3ecc6e45d2f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext4_handle_dirty_metadata(handle, NULL, gdb); + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto exit_bh; + } ext4_set_bit(bit, bh->b_data); 
brelse(gdb); } @@ -258,7 +262,11 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_bh; + } brelse(bh); /* Mark unused entries in inode bitmap used */ ext4_debug("clear inode bitmap %#04llx (+%llu)\n", @@ -270,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); exit_bh: brelse(bh); @@ -422,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dind; } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) goto exit_dind; - if ((err = ext4_journal_get_write_access(handle, *primary))) + err = ext4_journal_get_write_access(handle, *primary); + if (unlikely(err)) goto exit_sbh; - if ((err = ext4_journal_get_write_access(handle, dind))) - goto exit_primary; + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); /* ext4_reserve_inode_write() gets a reference on the iloc */ - if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) goto exit_dindj; n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -454,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). 
*/ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_handle_dirty_metadata(handle, NULL, dind); - brelse(dind); + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_handle_dirty_metadata(handle, NULL, *primary); + err = ext4_handle_dirty_metadata(handle, NULL, *primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -470,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); - return 0; + return err; exit_inode: /* ext4_journal_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: /* ext4_journal_release_buffer(handle, dind); */ -exit_primary: - /* ext4_journal_release_buffer(handle, *primary); */ exit_sbh: - /* ext4_journal_release_buffer(handle, *primary); */ + /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -665,7 +687,9 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); brelse(bh); } if ((err2 = ext4_journal_stop(handle)) && !err) @@ -883,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext4_handle_dirty_metadata(handle, NULL, primary); + err = ext4_handle_dirty_metadata(handle, NULL, primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_journal; + } /* Update the reserved block counts only once the new group is * active. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cd37f9d5e44..48ce561fafa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb) void __ext4_error(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", - sb->s_id, function, line, current->comm); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); va_end(args); ext4_handle_error(sb); @@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function, const char *fmt, ...) 
{ va_list args; + struct va_format vaf; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); save_error_info(inode->i_sb, function, line); va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", inode->i_sb->s_id, function, line, inode->i_ino); if (block) - printk("block %llu: ", block); - printk("comm %s: ", current->comm); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu: ", block); + printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); va_end(args); ext4_handle_error(inode->i_sb); } void ext4_error_file(struct file *file, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; + struct va_format vaf; struct ext4_super_block *es; struct inode *inode = file->f_dentry->d_inode; char pathname[80], *path; @@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); save_error_info(inode->i_sb, function, line); - va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); - if (!path) + if (IS_ERR(path)) path = "(unknown)"; printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu " - "(comm %s path %s): ", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, path); - vprintk(fmt, args); - printk("\n"); + "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function, panic("EXT4-fs panic from previous error\n"); } -void ext4_msg (struct super_block * sb, const char *prefix, - const char *fmt, ...) +void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT4-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); va_end(args); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", - sb->s_id, function, line); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); } @@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line, __releases(bitlock) __acquires(bitlock) { + struct va_format vaf; va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", sb->s_id, function, line, grp); if (ino) - printk("inode %lu: ", ino); + printk(KERN_CONT "inode %lu: ", ino); if (block) - printk("block %llu:", (unsigned long long) block); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu:", (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); va_end(args); if (test_opt(sb, ERRORS_CONT)) { @@ -647,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -663,8 +673,7 @@ fail: */ static int ext4_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext4_blkdev_remove(struct ext4_sb_info *sbi) @@ -808,21 +817,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - /* - * Note: We can be called before EXT4_SB(sb)->s_journal is set, - * therefore it can be null here. Don't check it, just initialize - * jinode. 
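All of the message helpers in this file get the same treatment: the caller's format string and va_list are wrapped in a struct va_format and emitted through a single printk() with %pV, so a message can no longer be torn apart by concurrent printks from other CPUs. A minimal kernel-style sketch of the pattern (%pV is a printk extension, so this only builds in the kernel; the helper name is a stand-in):

	#include <linux/kernel.h>

	static void fs_warning(const char *id, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* One printk call: format and arguments expand atomically. */
		printk(KERN_WARNING "FS (%s): %pV\n", id, &vaf);
		va_end(args);
	}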
- */ - jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; - ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif + ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; @@ -898,9 +901,12 @@ void ext4_clear_inode(struct inode *inode) end_writeback(inode); dquot_drop(inode); ext4_discard_preallocations(inode); - if (EXT4_JOURNAL(inode)) - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } } static inline void ext4_show_quota_options(struct seq_file *seq, @@ -1155,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -1393,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) sbi->s_qf_names[qtype] = NULL; return 0; } - set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sb, QUOTA); return 1; } @@ -1448,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb, switch (token) { case Opt_bsd_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, MINIX_DF); + clear_opt(sb, MINIX_DF); break; case Opt_minix_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, MINIX_DF); + set_opt(sb, MINIX_DF); break; case Opt_grpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); break; case Opt_nogrpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, GRPID); + clear_opt(sb, GRPID); break; case Opt_resuid: @@ -1480,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb, /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_RO); - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + set_opt(sb, ERRORS_PANIC); break; case Opt_err_ro: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_RO); break; case Opt_err_cont: - clear_opt(sbi->s_mount_opt, ERRORS_RO); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_CONT); break; case Opt_nouid32: - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); break; case Opt_debug: - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sbi->s_mount_opt, OLDALLOC); + set_opt(sb, OLDALLOC); break; case Opt_orlov: - 
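/*
 * The ext4_alloc_inode()/ext4_clear_inode() hunks above stop embedding a
 * jbd2_inode in every ext4 inode: ->jinode becomes a pointer, NULL until
 * the inode actually needs one (it is allocated lazily elsewhere in this
 * series) and torn down at eviction.  The teardown half, condensed from
 * the hunk above into a hypothetical helper:
 */
static void sample_drop_jinode(struct ext4_inode_info *ei, journal_t *journal)
{
	if (ei->jinode) {
		jbd2_journal_release_jbd_inode(journal, ei->jinode);
		jbd2_free_inode(ei->jinode);	/* pairs with the lazy alloc */
		ei->jinode = NULL;
	}
}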
clear_opt(sbi->s_mount_opt, OLDALLOC); + clear_opt(sb, OLDALLOC); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); break; case Opt_nouser_xattr: - clear_opt(sbi->s_mount_opt, XATTR_USER); + clear_opt(sb, XATTR_USER); break; #else case Opt_user_xattr: @@ -1521,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb, #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); break; case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); + clear_opt(sb, POSIX_ACL); break; #else case Opt_acl: @@ -1543,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb, "Cannot specify journal on remount"); return 0; } - set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); + set_opt(sb, UPDATE_JOURNAL); break; case Opt_journal_dev: if (is_remount) { @@ -1556,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_CHECKSUM); break; case Opt_journal_async_commit: - set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_ASYNC_COMMIT); + set_opt(sb, JOURNAL_CHECKSUM); break; case Opt_noload: - set_opt(sbi->s_mount_opt, NOLOAD); + set_opt(sb, NOLOAD); break; case Opt_commit: if (match_int(&args[0], &option)) @@ -1606,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); + clear_opt(sb, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; case Opt_data_err_abort: - set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + set_opt(sb, DATA_ERR_ABORT); break; case Opt_data_err_ignore: - clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + clear_opt(sb, DATA_ERR_ABORT); break; #ifdef CONFIG_QUOTA case Opt_usrjquota: @@ -1654,12 +1660,12 @@ set_qf_format: break; case Opt_quota: case Opt_usrquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, USRQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, USRQUOTA); break; case Opt_grpquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, GRPQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, GRPQUOTA); break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { @@ -1667,9 +1673,9 @@ set_qf_format: "options when quota turned on"); return 0; } - clear_opt(sbi->s_mount_opt, QUOTA); - clear_opt(sbi->s_mount_opt, USRQUOTA); - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, QUOTA); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); break; #else case Opt_quota: @@ -1695,7 +1701,7 @@ set_qf_format: sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; break; case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case Opt_barrier: if (args[0].from) { @@ -1704,9 +1710,9 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); else - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case Opt_ignore: break; @@ -1730,17 +1736,17 @@ set_qf_format: "Ignoring deprecated bh option"); break; case Opt_i_version: - set_opt(sbi->s_mount_opt, I_VERSION); + set_opt(sb, I_VERSION); sb->s_flags |= MS_I_VERSION; break; case Opt_nodelalloc: - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); break; case Opt_mblk_io_submit: - set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + set_opt(sb, 
MBLK_IO_SUBMIT); break; case Opt_nomblk_io_submit: - clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + clear_opt(sb, MBLK_IO_SUBMIT); break; case Opt_stripe: if (match_int(&args[0], &option)) @@ -1750,13 +1756,13 @@ set_qf_format: sbi->s_stripe = option; break; case Opt_delalloc: - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); break; case Opt_block_validity: - set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); break; case Opt_noblock_validity: - clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + clear_opt(sb, BLOCK_VALIDITY); break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) @@ -1780,7 +1786,7 @@ set_qf_format: option); break; case Opt_noauto_da_alloc: - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb, NO_AUTO_DA_ALLOC); break; case Opt_auto_da_alloc: if (args[0].from) { @@ -1789,24 +1795,24 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + clear_opt(sb, NO_AUTO_DA_ALLOC); else - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb,NO_AUTO_DA_ALLOC); break; case Opt_discard: - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); break; case Opt_nodiscard: - clear_opt(sbi->s_mount_opt, DISCARD); + clear_opt(sb, DISCARD); break; case Opt_dioread_nolock: - set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + set_opt(sb, DIOREAD_NOLOCK); break; case Opt_dioread_lock: - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); break; case Opt_init_inode_table: - set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + set_opt(sb, INIT_INODE_TABLE); if (args[0].from) { if (match_int(&args[0], &option)) return 0; @@ -1817,7 +1823,7 @@ set_qf_format: sbi->s_li_wait_mult = option; break; case Opt_noinit_inode_table: - clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + clear_opt(sb, INIT_INODE_TABLE); break; default: ext4_msg(sb, KERN_ERR, @@ -1829,10 +1835,10 @@ set_qf_format: #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) - clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sb, USRQUOTA); if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext4_msg(sb, KERN_ERR, "old and new quota " @@ -1902,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " - "bpg=%lu, ipg=%lu, mo=%04x]\n", + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), - sbi->s_mount_opt); + sbi->s_mount_opt, sbi->s_mount_opt2); return res; } @@ -1937,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb) size = flex_group_count * sizeof(struct flex_groups); sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vmalloc(size); - if (sbi->s_flex_groups) - memset(sbi->s_flex_groups, 0, size); - } - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for " - "%u flex groups", flex_group_count); - goto failed; + sbi->s_flex_groups = vzalloc(size); + if (sbi->s_flex_groups == NULL) { + ext4_msg(sb, KERN_ERR, + "not enough memory for %u flex groups", + flex_group_count); + goto failed; + } } for (i = 0; i < sbi->s_groups_count; i++) { @@ -2923,7 +2928,7 @@ static int 
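/*
 * The long run of parse_options() hunks above is a mechanical conversion
 * of set_opt()/clear_opt() from taking the raw sbi->s_mount_opt word to
 * taking the superblock.  The new definitions are not part of this
 * excerpt; assuming ext4's usual style they reduce to roughly the
 * following (sample_ prefix added here to flag the assumption):
 */
#define sample_set_opt(sb, opt) \
	(EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_##opt)
#define sample_clear_opt(sb, opt) \
	(EXT4_SB(sb)->s_mount_opt &= ~EXT4_MOUNT_##opt)
/*
 * Routing every call site through sb is what makes a second option word
 * practical -- note s_mount_opt2 ("mo2") in the ext4_setup_super() debug
 * printk above and in the remount save/restore code below.
 */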
ext4_register_li_request(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; - int ret; + int ret = 0; if (sbi->s_li_request != NULL) return 0; @@ -3078,41 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); } if (def_mount_opts & EXT4_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); #ifdef CONFIG_EXT4_FS_XATTR if (def_mount_opts & EXT4_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL if (def_mount_opts & EXT4_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); + set_opt(sb, ERRORS_CONT); else - set_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sb, ERRORS_RO); if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -3121,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); /* * enable delayed allocation by default @@ -3129,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (!IS_EXT3_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { @@ -3432,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "suppressed and not mounted read-only"); goto failed_mount_wq; } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + clear_opt(sb, DATA_FLAGS); + set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3471,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else - 
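/*
 * The s_flex_groups hunk a little above replaces an open-coded
 * vmalloc()+memset() fallback with vzalloc(): try the cheap contiguous
 * allocation first, fall back to vmalloc space for very large group
 * counts.  The shape of the pattern, as a generic sketch:
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *sample_alloc_big(size_t size)
{
	void *p = kzalloc(size, GFP_KERNEL);

	if (!p)
		p = vzalloc(size);	/* zeroed, non-contiguous fallback */
	return p;
}

static void sample_free_big(void *p)
{
	/* the free side must cope with either origin */
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}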
set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); break; case EXT4_MOUNT_ORDERED_DATA: @@ -3563,18 +3568,18 @@ no_journal: (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " "requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); } if (test_opt(sb, DIOREAD_NOLOCK)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } if (sb->s_blocksize < PAGE_SIZE) { ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - block size is too small"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } } @@ -3772,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext4_msg(sb, KERN_ERR, - "failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -4173,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb) return 0; } +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; @@ -4193,6 +4207,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; @@ -4346,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; @@ -4542,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->mnt->mnt_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); @@ -4573,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * all updates to the file when we bypass pagecache... 
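/*
 * ext4_remount() above snapshots every tunable that parse_options() may
 * touch -- now including the new s_mount_opt2 word -- and rolls the lot
 * back if the remount fails partway.  A condensed sketch of the
 * snapshot-then-rollback shape (sample_parse() is a hypothetical
 * stand-in for parse_options(), nonzero meaning success):
 */
static int sample_parse(struct ext4_sb_info *sbi, char *data)
{
	return data != NULL;	/* stand-in only */
}

static int sample_remount(struct ext4_sb_info *sbi, char *data)
{
	unsigned long old_opt = sbi->s_mount_opt;
	unsigned long old_opt2 = sbi->s_mount_opt2;

	if (sample_parse(sbi, data))
		return 0;

	/* failure: restore both option words, not just the original one */
	sbi->s_mount_opt = old_opt;
	sbi->s_mount_opt2 = old_opt2;
	return -EINVAL;
}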
*/ if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path.dentry->d_inode)) { + ext4_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... @@ -4581,15 +4590,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } static int ext4_quota_off(struct super_block *sb, int type) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fa4b899da4b..fc32176eee3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -427,23 +427,23 @@ cleanup: static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - int i_error, b_error; + int ret, ret2; down_read(&EXT4_I(dentry->d_inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); - if (i_error < 0) { - b_error = 0; - } else { - if (buffer) { - buffer += i_error; - buffer_size -= i_error; - } - b_error = ext4_xattr_block_list(dentry, buffer, buffer_size); - if (b_error < 0) - i_error = 0; + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: up_read(&EXT4_I(dentry->d_inode)->xattr_sem); - return i_error + b_error; + return ret; } /* @@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext4_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/fat/fat.h b/fs/fat/fat.h index d75a77f85c2..f50408901f7 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos); extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, int isvfat); + const struct inode_operations *fs_dir_inode_ops, + int isvfat, void (*setup)(struct super_block *)); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 206351af7c5..86753fe10bd 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -703,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode = NULL; - struct dentry *result; u32 *fh = fid->raw; if (fh_len < 5 || fh_type != 3) @@ -748,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb, * the fat_iget lookup again. If that fails, then we are totally out * of luck. 
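/*
 * The ->quota_on change above follows a VFS-side rework: the quota core
 * now resolves the pathname once and hands each filesystem a struct
 * path, so the per-fs kern_path()/path_put() boilerplate and its error
 * unwinding disappear.  What remains on the fs side is validation,
 * roughly:
 */
static int sample_quota_on(struct super_block *sb, int type, int format_id,
			   struct path *path)
{
	if (path->mnt->mnt_sb != sb)	/* quota file on another fs? */
		return -EXDEV;
	return dquot_quota_on(sb, type, format_id, path);
}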
But all that is for another day */ - result = d_obtain_alias(inode); - if (!IS_ERR(result)) - d_set_d_op(result, sb->s_root->d_op); - return result; + return d_obtain_alias(inode); } static int @@ -799,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child) brelse(bh); parent = d_obtain_alias(inode); - if (!IS_ERR(parent)) - d_set_d_op(parent, sb->s_root->d_op); out: unlock_super(sb); @@ -1244,7 +1238,8 @@ static int fat_read_root(struct inode *inode) * Read the super block of an MS-DOS FS. */ int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, int isvfat) + const struct inode_operations *fs_dir_inode_ops, int isvfat, + void (*setup)(struct super_block *)) { struct inode *root_inode = NULL, *fat_inode = NULL; struct buffer_head *bh; @@ -1280,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, if (error) goto out_fail; + setup(sb); /* flavour-specific stuff that needs options */ + error = -EIO; sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 35ffe43afa4..711499040eb 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -227,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, } out: unlock_super(sb); - d_set_d_op(dentry, &msdos_dentry_operations); - dentry = d_splice_alias(inode, dentry); - if (dentry) - d_set_d_op(dentry, &msdos_dentry_operations); - return dentry; + return d_splice_alias(inode, dentry); error: unlock_super(sb); @@ -661,21 +657,16 @@ static const struct inode_operations msdos_dir_inode_operations = { .getattr = fat_getattr, }; -static int msdos_fill_super(struct super_block *sb, void *data, int silent) +static void setup(struct super_block *sb) { - int res; - - lock_super(sb); - res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0); - if (res) { - unlock_super(sb); - return res; - } - + sb->s_d_op = &msdos_dentry_operations; sb->s_flags |= MS_NOATIME; - d_set_d_op(sb->s_root, &msdos_dentry_operations); - unlock_super(sb); - return 0; +} + +static int msdos_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, + 0, setup); } static struct dentry *msdos_mount(struct file_system_type *fs_type, diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index e3ffc5e1233..f88f752babd 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -772,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: unlock_super(sb); - d_set_d_op(dentry, sb->s_root->d_op); dentry->d_time = dentry->d_parent->d_inode->i_version; dentry = d_splice_alias(inode, dentry); - if (dentry) { - d_set_d_op(dentry, sb->s_root->d_op); + if (dentry) dentry->d_time = dentry->d_parent->d_inode->i_version; - } return dentry; error: @@ -1066,24 +1063,18 @@ static const struct inode_operations vfat_dir_inode_operations = { .getattr = fat_getattr, }; -static int vfat_fill_super(struct super_block *sb, void *data, int silent) +static void setup(struct super_block *sb) { - int res; - - lock_super(sb); - res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1); - if (res) { - unlock_super(sb); - return res; - } - if (MSDOS_SB(sb)->options.name_check != 's') - d_set_d_op(sb->s_root, &vfat_ci_dentry_ops); + sb->s_d_op = &vfat_ci_dentry_ops; else - d_set_d_op(sb->s_root, &vfat_dentry_ops); + sb->s_d_op = &vfat_dentry_ops; +} - unlock_super(sb); - return 0; 
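/*
 * The FAT hunks here lean on the then-new sb->s_d_op field: set once,
 * d_alloc() installs those dentry_operations on every dentry created
 * under the superblock, so the scattered d_set_d_op() calls in lookup
 * and export code can simply be deleted.  The per-flavour hook reduces
 * to assignments like:
 */
static void sample_fat_setup(struct super_block *sb)
{
	/* every future dentry on this sb inherits these ops */
	sb->s_d_op = &msdos_dentry_operations;
	sb->s_flags |= MS_NOATIME;	/* msdos flavour only */
}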
+static int vfat_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, + 1, setup); } static struct dentry *vfat_mount(struct file_system_type *fs_type, diff --git a/fs/file_table.c b/fs/file_table.c index c3dee381f1b..c3e89adf53c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) struct files_struct *files = current->files; *fput_needed = 0; - if (likely((atomic_read(&files->count) == 1))) { + if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); } else { rcu_read_lock(); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d06ccc953a..59c6e495678 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head) return list_entry(head, struct inode, i_wb_list); } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ +static void bdi_wakeup_flusher(struct backing_dev_info *bdi) { - trace_writeback_queue(bdi, work); - - spin_lock_bh(&bdi->wb_lock); - list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); } else { @@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * The bdi thread isn't there, wake up the forker thread which * will create and run it. */ - trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (!bdi->wb.task) + trace_writeback_nothread(bdi, work); + bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); } static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, bool for_background) + bool range_cyclic) { struct wb_writeback_work *work; @@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; - work->for_background = for_background; bdi_queue_work(bdi, work); } @@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - __bdi_start_writeback(bdi, nr_pages, true, false); + __bdi_start_writeback(bdi, nr_pages, true); } /** @@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) * @bdi: the backing device to write from * * Description: - * This does WB_SYNC_NONE background writeback. The IO is only - * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. */ void bdi_start_background_writeback(struct backing_dev_info *bdi) { - __bdi_start_writeback(bdi, LONG_MAX, true, true); + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. 
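/*
 * bdi_queue_work() above is refactored so the "wake the flusher thread,
 * or the forker thread that will create it" step becomes a helper,
 * bdi_wakeup_flusher(), which requires bdi->wb_lock but queues nothing.
 * That is what lets the background path wake the flusher without
 * allocating a work item, roughly:
 */
static void sample_kick_flusher(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi->wb_lock);
	bdi_wakeup_flusher(bdi);	/* the new static helper above */
	spin_unlock_bh(&bdi->wb_lock);
}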
+ */ + trace_writeback_wake_background(bdi); + spin_lock_bh(&bdi->wb_lock); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; + long write_chunk; struct inode *inode; if (wbc.for_kupdate) { @@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.range_end = LLONG_MAX; } + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (wbc.sync_mode == WB_SYNC_NONE) + write_chunk = MAX_WRITEBACK_PAGES; + else + write_chunk = LONG_MAX; + wbc.wb_start = jiffies; /* livelock avoidance */ for (;;) { /* @@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb, break; /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. + */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + + /* * For background writeout, stop when we are below the * background dirty threshold */ @@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, break; wbc.more_io = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; trace_wbc_writeback_start(&wbc, wb->bdi); @@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb, writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); - work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; /* * If we consumed everything, see if we have more @@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Did we write something? Try for more */ - if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + if (wbc.nr_to_write < write_chunk) continue; /* * Nothing written. 
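/*
 * wb_writeback() above stops hard-coding MAX_WRITEBACK_PAGES per pass.
 * WB_SYNC_ALL livelock avoidance works by tagging the currently dirty
 * pages and then writing only tagged ones, which holds only if each
 * inode is visited a single time -- hence an effectively unbounded
 * chunk for sync.  The selection, isolated:
 */
static long sample_pick_chunk(enum writeback_sync_modes mode)
{
	/* WB_SYNC_ALL must sweep each inode once; don't slice the pass */
	return mode == WB_SYNC_NONE ? MAX_WRITEBACK_PAGES : LONG_MAX;
}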
Wait for some inode to @@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void) get_nr_dirty_inodes(); } +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh()) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; @@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); clear_bit(BDI_writeback_running, &wb->bdi->state); return wrote; @@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, false); + __bdi_start_writeback(bdi, nr_pages, false); } rcu_read_unlock(); } @@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this - * super_block. The number of pages synced is returned. + * super_block. */ void sync_inodes_sb(struct super_block *sb) { @@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) EXPORT_SYMBOL(sync_inode); /** - * sync_inode - write an inode to disk + * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * - * Write an inode to disk and adjust it's dirty state after completion. + * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 68ca487bedb..78b519c1353 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -4,6 +4,19 @@ #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> +#include "internal.h" + +static inline void path_get_longterm(struct path *path) +{ + path_get(path); + mnt_make_longterm(path->mnt); +} + +static inline void path_put_longterm(struct path *path) +{ + mnt_make_shortterm(path->mnt); + path_put(path); +} /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 
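/*
 * wb_check_background_flush() above shows the idiom for flusher-internal
 * work: since the flusher thread is both producer and consumer here, the
 * work descriptor can live on its own stack instead of being kmalloc'd
 * and queued, and wb_do_writeback() only reaches it after the queued
 * work list is drained, making background writeback the lowest-priority
 * activity.  Condensed:
 */
static long sample_background_pass(struct bdi_writeback *wb)
{
	struct wb_writeback_work work = {	/* on-stack, never queued */
		.nr_pages	= LONG_MAX,
		.sync_mode	= WB_SYNC_NONE,
		.for_background	= 1,
		.range_cyclic	= 1,
	};

	return over_bground_thresh() ? wb_writeback(wb, &work) : 0;
}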
@@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path) write_seqcount_begin(&fs->seq); old_root = fs->root; fs->root = *path; - path_get_long(path); + path_get_longterm(path); write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) - path_put_long(&old_root); + path_put_longterm(&old_root); } /* @@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path) write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; - path_get_long(path); + path_get_longterm(path); write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_pwd.dentry) - path_put_long(&old_pwd); + path_put_longterm(&old_pwd); } void chroot_fs_refs(struct path *old_root, struct path *new_root) @@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) write_seqcount_begin(&fs->seq); if (fs->root.dentry == old_root->dentry && fs->root.mnt == old_root->mnt) { - path_get_long(new_root); + path_get_longterm(new_root); fs->root = *new_root; count++; } if (fs->pwd.dentry == old_root->dentry && fs->pwd.mnt == old_root->mnt) { - path_get_long(new_root); + path_get_longterm(new_root); fs->pwd = *new_root; count++; } @@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) } while_each_thread(g, p); read_unlock(&tasklist_lock); while (count--) - path_put_long(old_root); + path_put_longterm(old_root); } void free_fs_struct(struct fs_struct *fs) { - path_put_long(&fs->root); - path_put_long(&fs->pwd); + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); kmem_cache_free(fs_cachep, fs); } @@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) spin_lock(&old->lock); fs->root = old->root; - path_get_long(&fs->root); + path_get_longterm(&fs->root); fs->pwd = old->pwd; - path_get_long(&fs->pwd); + path_get_longterm(&fs->pwd); spin_unlock(&old->lock); } return fs; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index b9f34eaede0..48a18f184d5 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ - if (object->n_ops > 0) { + if (object->n_ops > 1) { atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6e07696308d..cf8d28d1fba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) kill_fasync(&fc->fasync, SIGIO, POLL_IN); } +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup) +{ + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup = nlookup; + + spin_lock(&fc->lock); + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_unlock(&fc->lock); +} + static void flush_bg_queue(struct fuse_conn *fc) { while (fc->active_background < fc->max_background && @@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) -{ - req->isreply = 0; - fuse_request_send_nowait(fc, req); -} - void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; @@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, 
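/*
 * The fs_struct conversion above swaps path_get_long() for local
 * path_get_longterm()/path_put_longterm() wrappers: an ordinary path
 * reference plus mnt_make_longterm(), telling the per-mount refcounting
 * that a long-lived user (a task's root or cwd) holds the mount.
 * Usage mirrors plain path_get()/path_put(); a pwd update in the style
 * of set_fs_pwd():
 */
static void sample_set_pwd(struct fs_struct *fs, struct path *new_pwd)
{
	struct path old;

	spin_lock(&fs->lock);
	write_seqcount_begin(&fs->seq);
	old = fs->pwd;
	fs->pwd = *new_pwd;
	path_get_longterm(new_pwd);	/* ref + long-term mount marking */
	write_seqcount_end(&fs->seq);
	spin_unlock(&fs->lock);

	if (old.dentry)
		path_put_longterm(&old);
}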
return err; } +static int forget_pending(struct fuse_conn *fc) +{ + return fc->forget_list_head.next != NULL; +} + static int request_pending(struct fuse_conn *fc) { - return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || + forget_pending(fc); } /* Wait until a request is available on the pending list */ @@ -960,6 +974,120 @@ __releases(fc->lock) return err ? err : reqsize; } +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, + unsigned max, + unsigned *countp) +{ + struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; + + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fc->forget_list_head.next = *newhead; + *newhead = NULL; + if (fc->forget_list_head.next == NULL) + fc->forget_list_tail = &fc->forget_list_head; + + if (countp != NULL) + *countp = count; + + return head; +} + +static int fuse_read_single_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + int err; + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_in arg = { + .nlookup = forget->forget_one.nlookup, + }; + struct fuse_in_header ih = { + .opcode = FUSE_FORGET, + .nodeid = forget->forget_one.nodeid, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + spin_unlock(&fc->lock); + kfree(forget); + if (nbytes < ih.len) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fc->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = dequeue_forget(fc, max_forgets, &count); + spin_unlock(&fc->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fc, cs, nbytes); + else + return fuse_read_batch_forget(fc, cs, nbytes); +} + /* * Read a single request into the userspace filesystem's buffer. 
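/*
 * The forget queue introduced above is a singly linked FIFO with a tail
 * pointer: fc->forget_list_head is a dummy node whose ->next is the real
 * head, and "empty" is tail == &head.  dequeue_forget() detaches up to
 * max entries in one operation by walking link pointers.  The queue
 * discipline in miniature, with generic types:
 */
struct sample_link {
	struct sample_link *next;
};

struct sample_fifo {
	struct sample_link head;	/* dummy; head.next is the first entry */
	struct sample_link *tail;	/* &head when the queue is empty */
};

static void sample_enqueue(struct sample_fifo *q, struct sample_link *l)
{
	l->next = NULL;
	q->tail->next = l;
	q->tail = l;
}

static struct sample_link *sample_dequeue_upto(struct sample_fifo *q,
					       unsigned int max,
					       unsigned int *countp)
{
	struct sample_link *batch = q->head.next;
	struct sample_link **newhead = &batch;
	unsigned int n;

	for (n = 0; *newhead != NULL && n < max; n++)
		newhead = &(*newhead)->next;

	q->head.next = *newhead;	/* remainder stays queued */
	*newhead = NULL;		/* detach the batch */
	if (q->head.next == NULL)
		q->tail = &q->head;
	if (countp)
		*countp = n;
	return batch;
}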
This * function waits until a request is available, then removes it from @@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, return fuse_read_interrupt(fc, cs, nbytes, req); } + if (forget_pending(fc)) { + if (list_empty(&fc->pending) || fc->forget_batch-- > 0) + return fuse_read_forget(fc, cs, nbytes); + + if (fc->forget_batch <= -8) + fc->forget_batch = 16; + } + req = list_entry(fc->pending.next, struct fuse_req, list); req->state = FUSE_REQ_READING; list_move(&req->list, &fc->io); @@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!fc) return -EPERM; - bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, if (!fc) return -EPERM; - bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -1770,6 +1906,8 @@ __acquires(fc->lock) flush_bg_queue(fc); end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); + while (forget_pending(fc)) + kfree(dequeue_forget(fc, 1, NULL)); } /* diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f738599fd8c..bfed8447ed8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -10,9 +10,9 @@ #include <linux/pagemap.h> #include <linux/file.h> -#include <linux/gfp.h> #include <linux/sched.h> #include <linux/namei.h> +#include <linux/slab.h> #if BITS_PER_LONG >= 64 static inline void fuse_dentry_settime(struct dentry *entry, u64 time) @@ -169,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) struct fuse_entry_out outarg; struct fuse_conn *fc; struct fuse_req *req; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; struct dentry *parent; u64 attr_version; @@ -182,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (IS_ERR(req)) return 0; - forget_req = fuse_get_req(fc); - if (IS_ERR(forget_req)) { + forget = fuse_alloc_forget(); + if (!forget) { fuse_put_request(fc, req); return 0; } @@ -203,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { - fuse_send_forget(fc, forget_req, - outarg.nodeid, 1); + fuse_queue_forget(fc, forget, outarg.nodeid, 1); return 0; } spin_lock(&fc->lock); fi->nlookup++; spin_unlock(&fc->lock); } - fuse_put_request(fc, forget_req); + kfree(forget); if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) return 0; @@ -263,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, { struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_req *req; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; u64 attr_version; int err; @@ -277,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (IS_ERR(req)) goto out; - forget_req = fuse_get_req(fc); - err = PTR_ERR(forget_req); - if (IS_ERR(forget_req)) { + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) { fuse_put_request(fc, req); goto out; } @@ -305,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, attr_version); err = -ENOMEM; if (!*inode) { - fuse_send_forget(fc, forget_req, outarg->nodeid, 1); + fuse_queue_forget(fc, 
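/*
 * The fuse_dev_do_read() hunk above interleaves forgets with regular
 * requests via fc->forget_batch, a signed credit counter: while positive,
 * reads drain forgets; once exhausted, regular requests win until the
 * counter sinks to -8 and is recharged to 16.  With both queues busy
 * that yields about 16 forgets, then 8 requests, repeating, so neither
 * side starves the other.  The policy in isolation:
 */
static int sample_pick_forget(int *credit, int forgets_pending,
			      int requests_pending)
{
	if (!forgets_pending)
		return 0;			/* serve a regular request */
	if (!requests_pending || (*credit)-- > 0)
		return 1;			/* serve a forget */
	if (*credit <= -8)
		*credit = 16;			/* recharge the forget budget */
	return 0;
}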
forget, outarg->nodeid, 1); goto out; } err = 0; out_put_forget: - fuse_put_request(fc, forget_req); + kfree(forget); out: return err; } @@ -351,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, } entry = newent ? newent : entry; - d_set_d_op(entry, &fuse_dentry_operations); if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -378,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct inode *inode; struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_req *req; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; struct fuse_create_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; @@ -392,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (flags & O_DIRECT) return -EINVAL; - forget_req = fuse_get_req(fc); - if (IS_ERR(forget_req)) - return PTR_ERR(forget_req); + forget = fuse_alloc_forget(); + if (!forget) + return -ENOMEM; req = fuse_get_req(fc); err = PTR_ERR(req); @@ -452,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(ff, flags); - fuse_send_forget(fc, forget_req, outentry.nodeid, 1); + fuse_queue_forget(fc, forget, outentry.nodeid, 1); return -ENOMEM; } - fuse_put_request(fc, forget_req); + kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_invalidate_attr(dir); @@ -473,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, out_put_request: fuse_put_request(fc, req); out_put_forget_req: - fuse_put_request(fc, forget_req); + kfree(forget); return err; } @@ -487,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct fuse_entry_out outarg; struct inode *inode; int err; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; - forget_req = fuse_get_req(fc); - if (IS_ERR(forget_req)) { + forget = fuse_alloc_forget(); + if (!forget) { fuse_put_request(fc, req); - return PTR_ERR(forget_req); + return -ENOMEM; } memset(&outarg, 0, sizeof(outarg)); @@ -519,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, entry_attr_timeout(&outarg), 0); if (!inode) { - fuse_send_forget(fc, forget_req, outarg.nodeid, 1); + fuse_queue_forget(fc, forget, outarg.nodeid, 1); return -ENOMEM; } - fuse_put_request(fc, forget_req); + kfree(forget); if (S_ISDIR(inode->i_mode)) { struct dentry *alias; @@ -545,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, return 0; out_put_forget_req: - fuse_put_request(fc, forget_req); + kfree(forget); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8b984a2cebb..95da1bc1c82 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1634,9 +1634,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, * and 64bit. Fortunately we can determine which structure the server * used from the size of the reply. 
*/ -static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, - size_t transferred, unsigned count, - bool is_compat) +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) { #ifdef CONFIG_COMPAT if (count * sizeof(struct compat_iovec) == transferred) { @@ -1680,6 +1680,42 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) return 0; } +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + /* * For ioctls, there is no generic way to determine how much memory * needs to be read and/or written. Furthermore, ioctls are allowed @@ -1740,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, struct fuse_ioctl_out outarg; struct fuse_req *req = NULL; struct page **pages = NULL; - struct page *iov_page = NULL; + struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; size_t in_size, out_size, transferred; int err; +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + /* assume all the iovs returned by client always fits in a page */ - BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); - iov_page = alloc_page(GFP_KERNEL); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -1760,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, * RETRY from server is not allowed. */ if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { - struct iovec *iov = page_address(iov_page); + struct iovec *iov = iov_page; iov->iov_base = (void __user *)arg; iov->iov_len = _IOC_SIZE(cmd); @@ -1841,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* did it ask for retry? 
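/*
 * The new fuse_copy_ioctl_iovec() above consumes the fixed-width
 * struct fuse_ioctl_iovec (u64 base and len) and must reject values a
 * 32-bit kernel cannot represent.  Comparing a u64 against its own
 * round-trip through unsigned long is a compact truncation test; the
 * idiom in isolation:
 */
#include <linux/types.h>

static int sample_fits_ulong(__u64 v)
{
	/* false exactly when the cast drops high bits (32-bit kernels) */
	return v == (unsigned long)v;
}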
*/ if (outarg.flags & FUSE_IOCTL_RETRY) { - char *vaddr; + void *vaddr; /* no retry if in restricted mode */ err = -EIO; @@ -1862,14 +1905,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; vaddr = kmap_atomic(pages[0], KM_USER0); - err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, + err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr, KM_USER0); if (err) goto out; - in_iov = page_address(iov_page); + in_iov = iov_page; out_iov = in_iov + in_iovs; err = fuse_verify_ioctl_iov(in_iov, in_iovs); @@ -1891,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, out: if (req) fuse_put_request(fc, req); - if (iov_page) - __free_page(iov_page); + free_page((unsigned long) iov_page); while (num_pages) __free_page(pages[--num_pages]); kfree(pages); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 57d4a3a0f10..ae5744a2f9e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -53,6 +53,12 @@ extern struct mutex fuse_mutex; extern unsigned max_user_bgreq; extern unsigned max_user_congthresh; +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -66,7 +72,7 @@ struct fuse_inode { u64 nlookup; /** The request used for sending the FORGET message */ - struct fuse_req *forget_req; + struct fuse_forget_link *forget; /** Time in jiffies until the file attributes are valid */ u64 i_time; @@ -255,7 +261,6 @@ struct fuse_req { /** Data for asynchronous requests */ union { - struct fuse_forget_in forget_in; struct { struct fuse_release_in in; struct path path; @@ -369,6 +374,13 @@ struct fuse_conn { /** Pending interrupts */ struct list_head interrupts; + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + /** Flag indicating if connection is blocked. 
This will be the case before the INIT reply is received, and if there are too many outstading backgrounds requests */ @@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, /** * Send FORGET command */ -void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - u64 nodeid, u64 nlookup); +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); /** * Initialize READ or READDIR request @@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); /** - * Send a request with no reply - */ -void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); - -/** * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a8b31da19b9..9e3f68cc1bd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -71,6 +71,11 @@ struct fuse_mount_data { unsigned blksize; }; +struct fuse_forget_link *fuse_alloc_forget() +{ + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +} + static struct inode *fuse_alloc_inode(struct super_block *sb) { struct inode *inode; @@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); init_waitqueue_head(&fi->page_waitq); - fi->forget_req = fuse_request_alloc(); - if (!fi->forget_req) { + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { kmem_cache_free(fuse_inode_cachep, inode); return NULL; } @@ -111,24 +116,10 @@ static void fuse_destroy_inode(struct inode *inode) struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!list_empty(&fi->write_files)); BUG_ON(!list_empty(&fi->queued_writes)); - if (fi->forget_req) - fuse_request_free(fi->forget_req); + kfree(fi->forget); call_rcu(&inode->i_rcu, fuse_i_callback); } -void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - u64 nodeid, u64 nlookup) -{ - struct fuse_forget_in *inarg = &req->misc.forget_in; - inarg->nlookup = nlookup; - req->in.h.opcode = FUSE_FORGET; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_forget_in); - req->in.args[0].value = inarg; - fuse_request_send_noreply(fc, req); -} - static void fuse_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); @@ -136,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode) if (inode->i_sb->s_flags & MS_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); - fi->forget_req = NULL; + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; } } @@ -541,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc) INIT_LIST_HEAD(&fc->interrupts); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); + fc->forget_list_tail = &fc->forget_list_head; atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; @@ -625,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, goto out_iput; entry = d_obtain_alias(inode); - if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { - d_set_d_op(entry, &fuse_dentry_operations); + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) 
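/*
 * fuse above keeps one preallocated forget link per inode: bought in
 * fuse_alloc_inode(), consumed by fuse_queue_forget() in
 * fuse_evict_inode().  Eviction must not fail, and linking a
 * preallocated node into a list cannot fail, so the -ENOMEM case is
 * pushed back to inode creation where failing is legal.  The
 * reservation pattern in generic form (all names hypothetical):
 */
#include <linux/slab.h>

struct sample_msg {
	u64 id;
};

struct sample_obj {
	struct sample_msg *death_msg;	/* reserved for the no-fail path */
};

static struct sample_obj *sample_create(void)
{
	struct sample_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;
	o->death_msg = kzalloc(sizeof(*o->death_msg), GFP_KERNEL);
	if (!o->death_msg) {
		kfree(o);
		return NULL;		/* fail here, never at teardown */
	}
	return o;
}

static void sample_destroy(struct sample_obj *o,
			   void (*queue_msg)(struct sample_msg *))
{
	queue_msg(o->death_msg);	/* hand off; cannot fail */
	o->death_msg = NULL;
	kfree(o);
}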
fuse_invalidate_entry_cache(entry); - } return entry; @@ -727,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child) } parent = d_obtain_alias(inode); - if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { - d_set_d_op(parent, &fuse_dentry_operations); + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) fuse_invalidate_entry_cache(parent); - } return parent; } @@ -997,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) iput(root); goto err_put_conn; } + /* only now - we want root dentry with NULL ->d_op */ + sb->s_d_op = &fuse_dentry_operations; init_req = fuse_request_alloc(); if (!init_req) diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 97012ecff56..9023db8184f 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -126,12 +126,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - struct dentry *dentry; - - dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); - if (!IS_ERR(dentry)) - d_set_d_op(dentry, &gfs2_dops); - return dentry; + return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); } static struct dentry *gfs2_get_dentry(struct super_block *sb, @@ -139,7 +134,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, { struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - struct dentry *dentry; inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { @@ -156,10 +150,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, return ERR_CAST(inode); out_inode: - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) - d_set_d_op(dentry, &gfs2_dops); - return dentry; + return d_obtain_alias(inode); } static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fca6689e12e..7cfdcb91336 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -19,6 +19,8 @@ #include <linux/fs.h> #include <linux/gfs2_ondisk.h> #include <linux/ext2_fs.h> +#include <linux/falloc.h> +#include <linux/swap.h> #include <linux/crc32.h> #include <linux/writeback.h> #include <asm/uaccess.h> @@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int error; + + if (!page_has_buffers(page)) { + error = __block_write_begin(page, from, to - from, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = 
__block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + 
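/*
 * This failure path belongs to the chunked preallocation loop: each pass
 * reserves quota and resource-group blocks for at most "bytes" of data,
 * and on -ENOSPC the chunk is halved and retried (see the retry label
 * below). The retry idiom in isolation, with a hypothetical reserve()
 * helper standing in for gfs2_inplace_reserve():
 *
 *	while ((err = reserve(chunk)) == -ENOSPC && chunk > blocksize)
 *		chunk >>= 1;
 */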
goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops = { @@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = generic_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8d3d2b4a0a7..a79790c0627 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -11,6 +11,7 @@ #define __INCORE_DOT_H__ #include <linux/fs.h> +#include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/dlm.h> #include <linux/buffer_head.h> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2232b3c780b..7aa7d4f8984 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -74,16 +74,14 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) } /** - * GFS2 lookup code fills in vfs inode contents based on info obtained - * from directory entry inside gfs2_inode_lookup(). This has caused issues - * with NFS code path since its get_dentry routine doesn't have the relevant - * directory entry when gfs2_inode_lookup() is invoked. Part of the code - * segment inside gfs2_inode_lookup code needs to get moved around. + * gfs2_set_iop - Sets inode operations + * @inode: The inode with correct i_mode filled in * - * Clears I_NEW as well. - **/ + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). 
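 * The function now does only what its name says: unlock_new_inode() moved
 * out to gfs2_inode_lookup(), which performs it once the inode is fully
 * initialized, roughly:
 *
 *	gfs2_set_iop(inode);
 *	unlock_new_inode(inode);
 *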
+ */ -void gfs2_set_iop(struct inode *inode) +static void gfs2_set_iop(struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(inode); umode_t mode = inode->i_mode; @@ -106,8 +104,6 @@ void gfs2_set_iop(struct inode *inode) inode->i_op = &gfs2_file_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); } - - unlock_new_inode(inode); } /** @@ -119,10 +115,8 @@ void gfs2_set_iop(struct inode *inode) * Returns: A VFS inode, or an error */ -struct inode *gfs2_inode_lookup(struct super_block *sb, - unsigned int type, - u64 no_addr, - u64 no_formal_ino) +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, + u64 no_addr, u64 no_formal_ino) { struct inode *inode; struct gfs2_inode *ip; @@ -152,51 +146,37 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_iopen; - ip->i_iopen_gh.gh_gl->gl_object = ip; + ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); io_gl = NULL; - if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) - goto gfs2_nfsbypass; - - inode->i_mode = DT2IF(type); - - /* - * We must read the inode in order to work out its type in - * this case. Note that this doesn't happen often as we normally - * know the type beforehand. This code path only occurs during - * unlinked inode recovery (where it is safe to do this glock, - * which is not true in the general case). - */ if (type == DT_UNKNOWN) { - struct gfs2_holder gh; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (unlikely(error)) - goto fail_glock; - /* Inode is now uptodate */ - gfs2_glock_dq_uninit(&gh); + /* Inode glock must be locked already */ + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_refresh; + } else { + inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); + unlock_new_inode(inode); } -gfs2_nfsbypass: return inode; -fail_glock: - gfs2_glock_dq(&ip->i_iopen_gh); + +fail_refresh: + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: if (io_gl) gfs2_glock_put(io_gl); fail_put: - if (inode->i_state & I_NEW) - ip->i_gl->gl_object = NULL; + ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); fail: - if (inode->i_state & I_NEW) - iget_failed(inode); - else - iput(inode); + iget_failed(inode); return ERR_PTR(error); } @@ -221,14 +201,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, if (IS_ERR(inode)) goto fail; - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) - goto fail_iput; - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - /* Two extra checks for NFS only */ if (no_formal_ino) { error = -ESTALE; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 732a183efdb..3e00a66e7cb 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,7 +96,6 @@ err: return -EIO; } -extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 2aeabd4218c..777927ce6f7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -440,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, iput(inode); return -ENOMEM; } - d_set_d_op(dentry, &gfs2_dops); *dptr = dentry; return 0; } @@ -1106,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct 
gfs2_args *args, int silent sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; sb->s_export_op = &gfs2_export_ops; sb->s_xattr = gfs2_xattr_handlers; sb->s_qcop = &gfs2_quotactl_ops; @@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error; struct gfs2_args args; struct gfs2_sbd *sdp; @@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, goto error_bdev; if (s->s_root) - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1342,7 +1342,7 @@ error_super: deactivate_locked_super(s); return ERR_PTR(error); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); return ERR_PTR(error); } diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1501db4f0e6..d8b26ac2e20 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,8 +18,6 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/fiemap.h> -#include <linux/swap.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -106,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, { struct inode *inode = NULL; - d_set_d_op(dentry, &gfs2_dops); - inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode && IS_ERR(inode)) return ERR_CAST(inode); @@ -1259,257 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } -static void empty_write_end(struct page *page, unsigned from, - unsigned to) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); - mark_page_accessed(page); - - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); -} - - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) -{ - unsigned start, end, next; - struct buffer_head *bh, *head; - int error; - - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, gfs2_block_map); - if (unlikely(error)) - return error; - - empty_write_end(page, from, to); - return 0; - } - - bh = head = page_buffers(page); - next = end = 0; - while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; - } - start = next; - do { - next += bh->b_size; - if (buffer_mapped(bh)) { - if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - end = 0; - } - start = next; - } - else - end = next; - bh = bh->b_this_page; - } while (next < to); - - if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - } - - return 0; -} - -static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, - int mode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *dibh; - int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = 
offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(error)) - goto out; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (unlikely(error)) - goto out; - } - - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } - - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) - goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; - } - - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - - brelse(dibh); - -out: - return error; -} - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; - unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); - - for (tmp = max_data; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - max_data -= tmp; - } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; - *data_blocks = max_data; - *ind_blocks = max_blocks - max_data; - *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; - if (*len > max) { - *len = max; - gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); - } -} - -static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, - loff_t len) -{ - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; - struct gfs2_alloc *al; - int error; - loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; - next = (next + 1) << sdp->sd_sb.sb_bsize_shift; - - offset = (offset >> sdp->sd_sb.sb_bsize_shift) << - sdp->sd_sb.sb_bsize_shift; - - len = next - offset; - bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; - if (!bytes) - bytes = UINT_MAX; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - if (!gfs2_write_alloc_required(ip, offset, len)) - goto out_unlock; - - while (len > 0) { - if (len < bytes) - bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - -retry: - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - goto retry; - } - goto out_qunlock; - } - max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - - rblocks = RES_DINODE + ind_blocks + RES_STATFS + 
RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = fallocate_chunk(inode, offset, max_bytes, mode); - gfs2_trans_end(sdp); - - if (error) - goto out_trans_fail; - - len -= max_bytes; - offset += max_bytes; - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - goto out_unlock; - -out_trans_fail: - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1560,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, - .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 16c2ecac7eb..ec73ed70bae 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode) if (error) goto out_truncate; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index ea4aefe7c65..afa66aaa223 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int res; - d_set_d_op(dentry, &hfs_dentry_operations); - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 0bef62aa4f4..1b55f704fb2 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -429,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) if (!root_inode) goto bail_no_root; + sb->s_d_op = &hfs_dentry_operations; res = -ENOMEM; sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) goto bail_iput; - d_set_d_op(sb->s_root, &hfs_dentry_operations); - /* everything's okay */ return 0; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index f896dc84302..4df5059c25d 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -37,7 +37,6 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; - d_set_d_op(dentry, &hfsplus_dentry_operations); dentry->d_fsdata = NULL; hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 6ee6ad20acf..9a3b4795f43 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -444,13 +444,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto cleanup; } + sb->s_d_op = &hfsplus_dentry_operations; sb->s_root = d_alloc_root(root); if (!sb->s_root) { iput(root); err = -ENOMEM; goto cleanup; } - d_set_d_op(sb->s_root, &hfsplus_dentry_operations); str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index d3244d949a4..2638c834ed2 100644 --- a/fs/hostfs/hostfs_kern.c +++ 
b/fs/hostfs/hostfs_kern.c @@ -612,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, goto out_put; d_add(dentry, inode); - d_set_d_op(dentry, &hostfs_dentry_ops); return NULL; out_put: @@ -922,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; + sb->s_d_op = &hostfs_dentry_ops; sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as <NULL> by sprintf: avoid that. */ diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 32c13a94e1e..05d4816e4e7 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -58,12 +58,7 @@ static int hpfs_compare_dentry(const struct dentry *parent, return 0; } -static const struct dentry_operations hpfs_dentry_operations = { +const struct dentry_operations hpfs_dentry_operations = { .d_hash = hpfs_hash_dentry, .d_compare = hpfs_compare_dentry, }; - -void hpfs_set_dentry_operations(struct dentry *dentry) -{ - d_set_d_op(dentry, &hpfs_dentry_operations); -} diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2338130cceb..d32f63a569f 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name end: end_add: - hpfs_set_dentry_operations(dentry); unlock_kernel(); d_add(dentry, result); return NULL; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 2fee17d0d9a..1c43dbea55e 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *); /* dentry.c */ -void hpfs_set_dentry_operations(struct dentry *); +extern const struct dentry_operations hpfs_dentry_operations; /* dir.c */ diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 56f0da1cfd1..1ae35baa539 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { error = vmtruncate(inode, attr->ia_size); if (error) - return error; + goto out_unlock; } setattr_copy(inode, attr); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 49935ba78db..b30426b1fc9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -550,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; + s->s_d_op = &hpfs_dentry_operations; sbi->sb_root = superblock->root; sbi->sb_fs_size = superblock->n_sectors; @@ -651,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) iput(root); goto bail0; } - hpfs_set_dentry_operations(s->s_root); /* * find the root directory's . 
pointer & finish filling in the inode diff --git a/fs/internal.h b/fs/internal.h index 9687c2ee273..0663568b124 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -70,6 +70,10 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, extern void release_mounts(struct list_head *); extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +extern int finish_automount(struct vfsmount *, struct path *); + +extern void mnt_make_longterm(struct vfsmount *); +extern void mnt_make_shortterm(struct vfsmount *); extern void __init mnt_init(void); diff --git a/fs/ioctl.c b/fs/ioctl.c index d6cc1647662..a59635e295f 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; - struct fiemap_extent *dest = fieinfo->fi_extents_start; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { @@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb, static int ioctl_fiemap(struct file *filp, unsigned long arg) { struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; @@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) if (!inode->i_op->fiemap) return -EOPNOTSUPP; - if (copy_from_user(&fiemap, (struct fiemap __user *)arg, - sizeof(struct fiemap))) + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) @@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; - fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); + fieinfo.fi_extents_start = ufiemap->fm_extents; if (fiemap.fm_extent_count != 0 && !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, @@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; - if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 844a7903c72..a0f3833c0db 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -939,17 +939,18 @@ root_found: goto out_iput; } - /* get the root dentry */ - s->s_root = d_alloc_root(inode); - if (!(s->s_root)) - goto out_no_root; - table = 0; if (joliet_level) table += 2; if (opt.check == 'r') table++; - d_set_d_op(s->s_root, &isofs_dentry_ops[table]); + + s->s_d_op = &isofs_dentry_ops[table]; + + /* get the root dentry */ + s->s_root = d_alloc_root(inode); + if (!(s->s_root)) + goto out_no_root; kfree(opt.iocharset); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 679a849c3b2..4fb3e8074fd 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -172,8 +172,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam struct inode *inode; struct page *page; - d_set_d_op(dentry, dir->i_sb->s_root->d_op); - page = alloc_page(GFP_USER); if (!page) return ERR_PTR(-ENOMEM); diff --git a/fs/jbd/transaction.c 
b/fs/jbd/transaction.c index 846a3f31411..5b2e4c30a2a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -207,7 +207,7 @@ repeat_locked: * the committing transaction. Really, we only need to give it * committing_transaction->t_outstanding_credits plus "enough" for * the log control blocks. - * Also, this test is inconsitent with the matching one in + * Also, this test is inconsistent with the matching one in * journal_extend(). */ if (__log_space_left(journal) < jbd_space_needed(journal)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f837ba95352..9e4686900f1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include <linux/vmalloc.h> #include <linux/backing-dev.h> #include <linux/bitops.h> +#include <linux/ratelimit.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -827,7 +829,7 @@ static journal_t * journal_init_common (void) journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - goto fail; + return NULL; init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_logspace); @@ -852,14 +854,12 @@ static journal_t * journal_init_common (void) err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); if (err) { kfree(journal); - goto fail; + return NULL; } spin_lock_init(&journal->j_history_lock); return journal; -fail: - return NULL; } /* jbd2_journal_init_dev and jbd2_journal_init_inode: @@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void) static struct journal_head *journal_alloc_journal_head(void) { struct journal_head *ret; - static unsigned long last_warning; #ifdef CONFIG_JBD2_DEBUG atomic_inc(&nr_journal_heads); @@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); - if (time_after(jiffies, last_warning + 5*HZ)) { - printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __func__); - last_warning = jiffies; - } + pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); while (!ret) { yield(); ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); @@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) #endif -struct kmem_cache *jbd2_handle_cache; +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; static int __init journal_init_handle_cache(void) { - jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", - sizeof(handle_t), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if (jbd2_handle_cache == NULL) { - printk(KERN_EMERG "JBD: failed to create handle cache\n"); + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (jbd2_inode_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create inode cache\n"); + kmem_cache_destroy(jbd2_handle_cache); return -ENOMEM; } return 0; @@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void) { if (jbd2_handle_cache) 
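/*
 * Both caches above are created with KMEM_CACHE(), the slab.h shorthand
 * keyed on a struct name; it expands along the lines of:
 *
 *	#define KMEM_CACHE(s, flags) \
 *		kmem_cache_create(#s, sizeof(struct s), \
 *				  __alignof__(struct s), (flags), NULL)
 *
 * so KMEM_CACHE(jbd2_inode, 0) yields a cache named "jbd2_inode".
 */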
kmem_cache_destroy(jbd2_handle_cache); + if (jbd2_inode_cache) + kmem_cache_destroy(jbd2_inode_cache); + } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 2bc4d5f116f..1cad869494f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); -#endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); +#endif journal->j_transaction_sequence = ++info.end_transaction; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6bf0a242613..faad2bd787c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -251,7 +251,7 @@ repeat: * the committing transaction. Really, we only need to give it * committing_transaction->t_outstanding_credits plus "enough" for * the log control blocks. - * Also, this test is inconsitent with the matching one in + * Also, this test is inconsistent with the matching one in * jbd2_journal_extend(). */ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { @@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) jbd2_free_handle(handle); current->journal_info = NULL; handle = ERR_PTR(err); - goto out; } -out: return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, transaction = handle->h_transaction; journal = transaction->t_journal; - jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -774,7 +772,7 @@ done: J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; - offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; + offset = offset_in_page(jh2bh(jh)->b_data); source = kmap_atomic(page, KM_USER0); /* Fire data frozen trigger just before we copy the data */ jbd2_buffer_frozen_trigger(jh, source + offset, diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 85c6be2db02..3005ec4520a 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; #ifndef __ECOS if (jffs2_blocks_use_vmalloc(c)) - c->blocks = vmalloc(size); + c->blocks = vzalloc(size); else #endif - c->blocks = kmalloc(size, GFP_KERNEL); + c->blocks = kzalloc(size, GFP_KERNEL); if (!c->blocks) return -ENOMEM; - memset(c->blocks, 0, size); for (i=0; i<c->nr_blocks; i++) { INIT_LIST_HEAD(&c->blocks[i].list); c->blocks[i].offset = i * c->sector_size; diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index f864005de64..0bc6a6c80a5 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -144,4 +144,4 @@ struct jffs2_sb_info { void *os_priv; }; -#endif /* _JFFS2_FB_SB */ +#endif /* _JFFS2_FS_SB */ diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 9b572ca40a4..4f9cc048294 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", offset, je32_to_cpu(rx.hdr_crc), crc); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK @@ -167,7 
+167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat je32_to_cpu(rx.xid), xd->xid, je32_to_cpu(rx.version), xd->version); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } xd->xprefix = rx.xprefix; xd->name_len = rx.name_len; @@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum ref_offset(xd->node), xd->data_crc, crc); kfree(data); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } xd->flags |= JFFS2_XFLAGS_HOT; @@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x if (xd->xname) return 0; if (xd->flags & JFFS2_XFLAGS_INVALID) - return EIO; + return -EIO; if (unlikely(is_xattr_datum_unchecked(c, xd))) rc = do_verify_xattr_datum(c, xd); if (!rc) @@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref if (crc != je32_to_cpu(rr.node_crc)) { JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", offset, je32_to_cpu(rr.node_crc), crc); - return EIO; + return -EIO; } if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF @@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, je32_to_cpu(rr.totlen), PAD(sizeof(rr))); - return EIO; + return -EIO; } ref->ino = je32_to_cpu(rr.ino); ref->xid = je32_to_cpu(rr.xid); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e1b8493b9aa..278e3fb40b7 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); if (IS_ERR(bdev)) { rc = -PTR_ERR(bdev); goto free; } - if ((rc = bd_claim(bdev, log))) { - goto close; - } - log->bdev = bdev; memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); @@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb) * initialize log: */ if ((rc = lmLogInit(log))) - goto unclaim; + goto close; list_add(&log->journal_list, &jfs_external_logs); @@ -1163,11 +1160,8 @@ journal_found: list_del(&log->journal_list); lbmLogShutdown(log); - unclaim: - bd_release(bdev); - close: /* close external log device */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb) bdev = log->bdev; rc = lmLogShutdown(log); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); kfree(log); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 4414e3a4226..81ead850ddb 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1465,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc jfs_info("jfs_lookup: name = %s", name); - if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2) - d_set_d_op(dentry, &jfs_ci_dentry_operations); - if ((name[0] == '.') && (len == 1)) inum = dip->i_ino; else if (strcmp(name, "..") == 0) @@ -1492,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc return ERR_CAST(ip); } - dentry = d_splice_alias(ip, dentry); - - if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)) - 
d_set_d_op(dentry, &jfs_ci_dentry_operations); - - return dentry; + return d_splice_alias(ip, dentry); } static struct inode *jfs_nfs_get_inode(struct super_block *sb, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 3150d766e0d..eeca48a031a 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -515,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = JFS_SUPER_MAGIC; + if (sbi->mntflag & JFS_OS2) + sb->s_d_op = &jfs_ci_dentry_operations; + inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -524,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) goto out_no_root; - if (sbi->mntflag & JFS_OS2) - d_set_d_op(sb->s_root, &jfs_ci_dentry_operations); - /* logical blocks are represented by 40 bits in pxd_t, etc. */ sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; #if BITS_PER_LONG == 32 diff --git a/fs/libfs.c b/fs/libfs.c index 889311e3d06..c88eab55aec 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -217,7 +217,8 @@ static const struct super_operations simple_super_operations = { * will never be mountable) */ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, unsigned long magic) + const struct super_operations *ops, + const struct dentry_operations *dops, unsigned long magic) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); struct dentry *dentry; @@ -254,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, dentry->d_parent = dentry; d_instantiate(dentry, root); s->s_root = dentry; + s->s_d_op = dops; s->s_flags |= MS_ACTIVE; return dget(s->s_root); diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 97f6073ab33..ca58d64374c 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_LOCKD) += lockd.o -lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ - svcproc.o svcsubs.o mon.o xdr.o grace.o -lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o +lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ + svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o +lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c new file mode 100644 index 00000000000..f848b52c67b --- /dev/null +++ b/fs/lockd/clnt4xdr.c @@ -0,0 +1,605 @@ +/* + * linux/fs/lockd/clnt4xdr.c + * + * XDR functions to encode/decode NLM version 4 RPC arguments and results. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include <linux/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/lockd/lockd.h> + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN) +# error "NLM host name cannot be larger than NLM's maximum string length!" 
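/*
 * Both #if tests are compile-time guards: an oversized NLMCLNT_OHSIZE
 * would overflow the fixed-size XDR buffers implied by the *_sz
 * constants below. Those constants count 32-bit XDR words; a netobj
 * costs one length word plus its payload in words, so, assuming
 * NLM_MAXCOOKIELEN is 32 bytes:
 *
 *	NLM4_cookie_sz = 1 + (32 >> 2) = 9 words (36 bytes on the wire)
 */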
+#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM4_void_sz (0) +#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2)) +#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz) +#define NLM4_holder_sz (6+NLM4_owner_sz) + +#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz) +#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz) +#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz) +#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz) + +#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz) +#define NLM4_res_sz (NLM4_cookie_sz+1) +#define NLM4_norep_sz (0) + + +static s64 loff_t_to_s64(loff_t offset) +{ + s64 res; + + if (offset >= NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset <= -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm4_compute_offsets(const struct nlm_lock *lock, + u64 *l_offset, u64 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM4_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM4_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv4 basic data types + * + * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 + * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter + * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? 
xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj<MAXNETOBJ_SZ> + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size > NFS3_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, fh->size); +} + +/* + * enum nlm4_stats { + * NLM4_GRANTED = 0, + * NLM4_DENIED = 1, + * NLM4_DENIED_NOLOCKS = 2, + * NLM4_BLOCKED = 3, + * NLM4_DENIED_GRACE_PERIOD = 4, + * NLM4_DEADLCK = 5, + * NLM4_ROFS = 6, + * NLM4_STALE_FH = 7, + * NLM4_FBIG = 8, + * NLM4_FAILED = 9 + * }; + * + * struct nlm4_stat { + * nlm4_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
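 * For example, an NLM4_DENIED reply arrives as the big-endian word
 * htonl(1); it is stored in nlm_res.status as a __be32 and compared
 * against pre-swapped constants (result->status == nlm_lck_denied)
 * instead of being converted to host order.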
+ */ +static void encode_nlm4_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_FAILED); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(*p > nlm4_failed)) + goto out_bad_xdr; + *stat = *p; + return 0; +out_bad_xdr: + dprintk("%s: server returned invalid nlm4_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm4_holder { + * bool exclusive; + * int32 svid; + * netobj oh; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u64 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + +static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u64 l_offset, l_len; + u32 exclusive; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 8 + 8); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + p = xdr_decode_hyper(p, &l_offset); + xdr_decode_hyper(p, &l_len); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name<LM_MAXSTRLEN>; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm4_lock { + * string caller_name<LM_MAXSTRLEN>; + * netobj fh; + * netobj oh; + * int32 svid; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u64 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 8 + 8); + *p++ = cpu_to_be32(lock->svid); + + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + + +/* + * NLMv4 XDR encode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". 
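 * None of these encoders is called directly: each is installed in the
 * nlm4_procedures[] table at the bottom of this file and invoked by the
 * SUNRPC layer through rpc_procinfo.p_encode when the corresponding NLM
 * procedure is issued.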
+ */ + +/* + * struct nlm4_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm4_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_unlockargs { + * netobj cookie; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static void nlm4_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); +} + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static void nlm4_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); + if (result->status == nlm_lck_denied) + encode_nlm4_holder(xdr, result); +} + + +/* + * NLMv4 XDR decode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". 
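 * The decoders share one defensive pattern: reserve bytes with
 * xdr_inline_decode(), treat a NULL return as a truncated reply, and
 * fail with -EIO after logging through print_overflow_msg(). Sketched:
 *
 *	p = xdr_inline_decode(xdr, 4);
 *	if (p == NULL)
 *		goto out_overflow;	// prints the message, returns -EIO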
+ */ + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static int decode_nlm4_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm4_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm4_holder(xdr, result); +out: + return error; +} + +static int nlm4_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static int nlm4_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm4_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \ + .p_arglen = NLM4_##argtype##_sz, \ + .p_replen = NLM4_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm4_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +struct rpc_version nlm_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nlm4_procedures), + .procs = nlm4_procedures, +}; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 25509eb28fd..8d4ea8351e3 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init); */ void nlmclnt_done(struct nlm_host *host) { - nlm_release_host(host); + nlmclnt_release_host(host); lockd_down(); } EXPORT_SYMBOL_GPL(nlmclnt_done); @@ -273,7 +273,7 @@ restart: spin_unlock(&nlm_blocked_lock); /* Release host handle after use */ - nlm_release_host(host); + nlmclnt_release_host(host); lockd_down(); return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 332c54cf75e..adb45ec9038 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -58,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner) return; list_del(&lockowner->list); spin_unlock(&lockowner->host->h_lock); - nlm_release_host(lockowner->host); + nlmclnt_release_host(lockowner->host); kfree(lockowner); } @@ -207,22 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) printk("nlm_alloc_call: failed, waiting for memory\n"); schedule_timeout_interruptible(5*HZ); } - nlm_release_host(host); + nlmclnt_release_host(host); return NULL; } -void nlm_release_call(struct nlm_rqst *call) +void nlmclnt_release_call(struct nlm_rqst *call) { if 
(!atomic_dec_and_test(&call->a_count)) return; - nlm_release_host(call->a_host); + nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); } static void nlmclnt_rpc_release(void *data) { - nlm_release_call(data); + nlmclnt_release_call(data); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -436,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) status = nlm_stat_to_errno(req->a_res.status); } out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; } @@ -593,7 +593,7 @@ again: out_unblock: nlmclnt_finish_block(block); out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; out_unlock: /* Fatal error: ensure that we remove the lock altogether */ @@ -694,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) /* What to do now? I'm out of my depth... */ status = -ENOLCK; out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; } @@ -755,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) status = -ENOLCK; - nlm_release_call(req); + nlmclnt_release_call(req); return status; } diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c new file mode 100644 index 00000000000..180ac34feb9 --- /dev/null +++ b/fs/lockd/clntxdr.c @@ -0,0 +1,627 @@ +/* + * linux/fs/lockd/clntxdr.c + * + * XDR functions to encode/decode NLM version 3 RPC arguments and results. + * NLM version 3 is backwards compatible with NLM versions 1 and 2. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include <linux/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/lockd/lockd.h> + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2)) +#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz) +#define NLM_holder_sz (4+NLM_owner_sz) + +#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz) +#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz) +#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz) +#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz) + +#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz) +#define NLM_res_sz (NLM_cookie_sz+1) +#define NLM_norep_sz (0) + + +static s32 loff_t_to_s32(loff_t offset) +{ + s32 res; + + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm_compute_offsets(const struct nlm_lock *lock, + u32 *l_offset, u32 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. 
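 * (This helper is duplicated from clnt4xdr.c so that the v3 and v4
 * codec files each remain self-contained.)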
+ */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv3 basic data types + * + * Basic NLMv3 data types are not defined in an IETF standards + * document. X/Open has a description of these data types that + * is useful. See Chapter 10 of "Protocols for Interworking: + * XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj<MAXNETOBJ_SZ> + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size != NFS2_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); +} + +/* + * enum nlm_stats { + * LCK_GRANTED = 0, + * LCK_DENIED = 1, + * LCK_DENIED_NOLOCKS = 2, + * LCK_BLOCKED = 3, + * LCK_DENIED_GRACE_PERIOD = 4 + * }; + * + * + * struct nlm_stat { + * nlm_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
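The netobj helpers above all produce the same XDR shape: a 32-bit big-endian length word followed by the opaque bytes, zero-padded to a 4-byte boundary. A minimal userspace sketch of that encoding, without the kernel's xdr_stream machinery:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static size_t encode_netobj(uint8_t *buf, const uint8_t *data, uint32_t len)
{
	uint32_t be_len = htonl(len);
	size_t padded = (len + 3) & ~(size_t)3;

	memcpy(buf, &be_len, 4);
	memcpy(buf + 4, data, len);
	memset(buf + 4 + len, 0, padded - len);	/* XDR pad to 4 bytes */
	return 4 + padded;			/* bytes consumed */
}

int main(void)
{
	uint8_t buf[16];
	size_t n = encode_netobj(buf, (const uint8_t *)"abc", 3);

	printf("encoded %zu bytes\n", n);	/* 4 length + 4 padded data */
	return 0;
}

The same layout explains decode_cookie()'s two-step xdr_inline_decode(): one word for the length, then the padded payload.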
+ */ + +static void encode_nlm_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm_stat(struct xdr_stream *xdr, + __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(*p > nlm_lck_denied_grace_period)) + goto out_enum; + *stat = *p; + return 0; +out_enum: + dprintk("%s: server returned invalid nlm_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm_holder { + * bool exclusive; + * int uppid; + * netobj oh; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u32 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + +static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u32 exclusive, l_offset, l_len; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + l_offset = be32_to_cpup(p++); + l_len = be32_to_cpup(p); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name<LM_MAXSTRLEN>; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm_lock { + * string caller_name<LM_MAXSTRLEN>; + * netobj fh; + * netobj oh; + * int uppid; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u32 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(lock->svid); + + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + + +/* + * NLMv3 XDR encode functions + * + * NLMv3 argument types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". 
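decode_nlm_holder() above rebuilds a struct file_lock range from the wire's (l_offset, l_len) pair: the end offset is l_offset + l_len - 1 evaluated as a signed 32-bit value, and either a zero length or a negative end means "to end of file". A small sketch of just that arithmetic:

#include <stdio.h>
#include <stdint.h>

#define OFFSET_MAX INT64_MAX

int main(void)
{
	uint32_t l_offset = 100, l_len = 0;	/* l_len == 0: lock to EOF */
	int32_t end = (int32_t)(l_offset + l_len - 1);
	int64_t fl_start = (int64_t)l_offset;
	int64_t fl_end = (l_len == 0 || end < 0) ? OFFSET_MAX : (int64_t)end;

	printf("fl_start=%lld fl_end=%lld\n",
	       (long long)fl_start, (long long)fl_end);
	return 0;
}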
+ */ + +/* + * struct nlm_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_unlockargs { + * netobj cookie; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static void nlm_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); +} + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static void encode_nlm_testrply(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + if (result->status == nlm_lck_denied) + encode_nlm_holder(xdr, result); +} + +static void nlm_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); + encode_nlm_testrply(xdr, result); +} + + +/* + * NLMv3 XDR decode functions + * + * NLMv3 result types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". 
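The encoders in this block are sized by the NLM_*_sz word budgets declared at the top of clntxdr.c: each netobj costs one length word plus its payload rounded up to whole words. A toy program showing how those budgets compose (the constant values here are assumptions for illustration; the real ones live in kernel headers):

#include <stdio.h>

#define NLM_MAXCOOKIELEN 32	/* assumed */
#define NLMCLNT_OHSIZE 64	/* assumed */
#define NFS2_FHSIZE 32

#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2))
#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2))
#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2))
#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)

int main(void)
{
	printf("lock=%d words, testargs=%d words\n",
	       NLM_lock_sz, NLM_cookie_sz + 1 + NLM_lock_sz);
	return 0;
}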
+ */ + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static int decode_nlm_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm_holder(xdr, result); +out: + return error; +} + +static int nlm_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static int nlm_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \ + .p_arglen = NLM_##argtype##_sz, \ + .p_replen = NLM_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +static struct rpc_version nlm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version nlm_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version *nlm_versions[] = { + [1] = &nlm_version1, + [3] = &nlm_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlm_version4, +#endif +}; + +static struct rpc_stat nlm_rpc_stats; + +struct rpc_program nlm_program = { + .name = "lockd", + .number = NLM_PROGRAM, + .nrvers = ARRAY_SIZE(nlm_versions), + .version = nlm_versions, + .stats = &nlm_rpc_stats, +}; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index ed0c59fe23c..5f1bcb2f06f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -25,9 +25,22 @@ #define NLM_HOST_EXPIRE (300 * HZ) #define NLM_HOST_COLLECT (120 * HZ) -static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; + +#define for_each_host(host, pos, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry((host), (pos), (chain), h_hash) + +#define for_each_host_safe(host, pos, next, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry_safe((host), (pos), (next), \ + 
(chain), h_hash) + static unsigned long next_gc; -static int nrhosts; +static unsigned long nrhosts; static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); @@ -40,8 +53,6 @@ struct nlm_lookup_host_info { const u32 version; /* NLM version to search for */ const char *hostname; /* remote's hostname */ const size_t hostname_len; /* it's length */ - const struct sockaddr *src_sap; /* our address (optional) */ - const size_t src_len; /* it's length */ const int noresvport; /* use non-priv port */ }; @@ -88,127 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap) } /* - * Common host lookup routine for server & client + * Allocate and initialize an nlm_host. Common to both client and server. */ -static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) +static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, + struct nsm_handle *nsm) { - struct hlist_head *chain; - struct hlist_node *pos; - struct nlm_host *host; - struct nsm_handle *nsm = NULL; - - mutex_lock(&nlm_host_mutex); + struct nlm_host *host = NULL; + unsigned long now = jiffies; - if (time_after_eq(jiffies, next_gc)) - nlm_gc_hosts(); - - /* We may keep several nlm_host objects for a peer, because each - * nlm_host is identified by - * (address, protocol, version, server/client) - * We could probably simplify this a little by putting all those - * different NLM rpc_clients into one single nlm_host object. - * This would allow us to have one nlm_host per address. - */ - chain = &nlm_hosts[nlm_hash_address(ni->sap)]; - hlist_for_each_entry(host, pos, chain, h_hash) { - if (!rpc_cmp_addr(nlm_addr(host), ni->sap)) - continue; - - /* See if we have an NSM handle for this client */ - if (!nsm) - nsm = host->h_nsmhandle; - - if (host->h_proto != ni->protocol) - continue; - if (host->h_version != ni->version) - continue; - if (host->h_server != ni->server) - continue; - if (ni->server && ni->src_len != 0 && - !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) - continue; - - /* Move to head of hash chain. */ - hlist_del(&host->h_hash); - hlist_add_head(&host->h_hash, chain); - - nlm_get_host(host); - dprintk("lockd: nlm_lookup_host found host %s (%s)\n", - host->h_name, host->h_addrbuf); - goto out; - } - - /* - * The host wasn't in our hash table. If we don't - * have an NSM handle for it yet, create one. 
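The new for_each_host()/for_each_host_safe() macros above wrap the two-level iteration that the old code spelled out by hand: walk every chain of the hash table, then every entry on the chain. A userspace sketch of the same shape, with plain singly linked lists standing in for the kernel's hlist:

#include <stdio.h>
#include <stddef.h>

#define NLM_HOST_NRHASH 32

struct host {
	struct host *next;
	const char *name;
};

static struct host *table[NLM_HOST_NRHASH];

/* every bucket, then every entry on that bucket's chain */
#define for_each_host(h, i)                                  \
	for ((i) = 0; (i) < NLM_HOST_NRHASH; (i)++)          \
		for ((h) = table[(i)]; (h) != NULL; (h) = (h)->next)

int main(void)
{
	struct host a = { NULL, "alpha" }, b = { NULL, "beta" };
	struct host *h;
	unsigned int i;

	table[5] = &a;
	table[9] = &b;
	for_each_host(h, i)
		printf("bucket %u: %s\n", i, h->name);
	return 0;
}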
- */ - if (nsm) + if (nsm != NULL) atomic_inc(&nsm->sm_count); else { host = NULL; nsm = nsm_get_handle(ni->sap, ni->salen, ni->hostname, ni->hostname_len); - if (!nsm) { - dprintk("lockd: nlm_lookup_host failed; " - "no nsm handle\n"); + if (unlikely(nsm == NULL)) { + dprintk("lockd: %s failed; no nsm handle\n", + __func__); goto out; } } - host = kzalloc(sizeof(*host), GFP_KERNEL); - if (!host) { + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (unlikely(host == NULL)) { + dprintk("lockd: %s failed; no memory\n", __func__); nsm_release(nsm); - dprintk("lockd: nlm_lookup_host failed; no memory\n"); goto out; } - host->h_name = nsm->sm_name; - host->h_addrbuf = nsm->sm_addrbuf; + memcpy(nlm_addr(host), ni->sap, ni->salen); - host->h_addrlen = ni->salen; + host->h_addrlen = ni->salen; rpc_set_port(nlm_addr(host), 0); - memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); - host->h_srcaddrlen = ni->src_len; + host->h_srcaddrlen = 0; + + host->h_rpcclnt = NULL; + host->h_name = nsm->sm_name; host->h_version = ni->version; host->h_proto = ni->protocol; - host->h_rpcclnt = NULL; - mutex_init(&host->h_mutex); - host->h_nextrebind = jiffies + NLM_HOST_REBIND; - host->h_expires = jiffies + NLM_HOST_EXPIRE; - atomic_set(&host->h_count, 1); + host->h_reclaiming = 0; + host->h_server = ni->server; + host->h_noresvport = ni->noresvport; + host->h_inuse = 0; init_waitqueue_head(&host->h_gracewait); init_rwsem(&host->h_rwsem); - host->h_state = 0; /* pseudo NSM state */ - host->h_nsmstate = 0; /* real NSM state */ - host->h_nsmhandle = nsm; - host->h_server = ni->server; - host->h_noresvport = ni->noresvport; - hlist_add_head(&host->h_hash, chain); + host->h_state = 0; + host->h_nsmstate = 0; + host->h_pidcount = 0; + atomic_set(&host->h_count, 1); + mutex_init(&host->h_mutex); + host->h_nextrebind = now + NLM_HOST_REBIND; + host->h_expires = now + NLM_HOST_EXPIRE; INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); INIT_LIST_HEAD(&host->h_granted); INIT_LIST_HEAD(&host->h_reclaim); - - nrhosts++; - - dprintk("lockd: nlm_lookup_host created host %s\n", - host->h_name); + host->h_nsmhandle = nsm; + host->h_addrbuf = nsm->sm_addrbuf; out: - mutex_unlock(&nlm_host_mutex); return host; } /* - * Destroy a host + * Destroy an nlm_host and free associated resources + * + * Caller must hold nlm_host_mutex. */ -static void -nlm_destroy_host(struct nlm_host *host) +static void nlm_destroy_host_locked(struct nlm_host *host) { struct rpc_clnt *clnt; + dprintk("lockd: destroy host %s\n", host->h_name); + BUG_ON(!list_empty(&host->h_lockowners)); BUG_ON(atomic_read(&host->h_count)); + hlist_del_init(&host->h_hash); + nsm_unmonitor(host); nsm_release(host->h_nsmhandle); @@ -216,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host) if (clnt != NULL) rpc_shutdown_client(clnt); kfree(host); + + nrhosts--; } /** @@ -249,12 +218,76 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .hostname_len = strlen(hostname), .noresvport = noresvport, }; + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + struct nsm_handle *nsm = NULL; dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, (hostname ? hostname : "<none>"), version, (protocol == IPPROTO_UDP ? "udp" : "tcp")); - return nlm_lookup_host(&ni); + mutex_lock(&nlm_host_mutex); + + chain = &nlm_client_hosts[nlm_hash_address(sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), sap)) + continue; + + /* Same address. 
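The split into nlm_alloc_host() and nlm_destroy_host_locked() pairs with a simple release discipline: drop a reference, and only the caller that takes the count to zero acquires nlm_host_mutex and destroys the object. A hedged userspace sketch of that pattern, with C11 atomics and pthreads standing in for atomic_t and the kernel mutex:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t host_mutex = PTHREAD_MUTEX_INITIALIZER;

struct host {
	atomic_int count;
	const char *name;
};

static void host_put(struct host *h)
{
	if (atomic_fetch_sub(&h->count, 1) != 1)
		return;			/* other references remain */
	pthread_mutex_lock(&host_mutex);
	printf("destroying %s\n", h->name);
	free(h);
	pthread_mutex_unlock(&host_mutex);
}

int main(void)
{
	struct host *h = malloc(sizeof(*h));

	if (h == NULL)
		return 1;
	atomic_init(&h->count, 2);
	h->name = "example";
	host_put(h);	/* 2 -> 1: nothing happens */
	host_put(h);	/* 1 -> 0: destroyed under the mutex */
	return 0;
}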
Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != protocol) + continue; + if (host->h_version != version) + continue; + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmclnt_release_host - release client nlm_host + * @host: nlm_host to release + * + */ +void nlmclnt_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release client host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(host->h_server); + + if (atomic_dec_and_test(&host->h_count)) { + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(!list_empty(&host->h_granted)); + BUG_ON(!list_empty(&host->h_reclaim)); + + mutex_lock(&nlm_host_mutex); + nlm_destroy_host_locked(host); + mutex_unlock(&nlm_host_mutex); + } } /** @@ -279,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, const size_t hostname_len) { + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host = NULL; + struct nsm_handle *nsm = NULL; struct sockaddr_in sin = { .sin_family = AF_INET, }; struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, }; + struct sockaddr *src_sap; + size_t src_len = rqstp->rq_addrlen; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -293,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, .version = rqstp->rq_vers, .hostname = hostname, .hostname_len = hostname_len, - .src_len = rqstp->rq_addrlen, }; dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + mutex_lock(&nlm_host_mutex); + switch (ni.sap->sa_family) { case AF_INET: sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; - ni.src_sap = (struct sockaddr *)&sin; + src_sap = (struct sockaddr *)&sin; break; case AF_INET6: ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); - ni.src_sap = (struct sockaddr *)&sin6; + src_sap = (struct sockaddr *)&sin6; break; default: - return NULL; + dprintk("lockd: %s failed; unrecognized address family\n", + __func__); + goto out; + } + + if (time_after_eq(jiffies, next_gc)) + nlm_gc_hosts(); + + chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != ni.protocol) + continue; + if (host->h_version != ni.version) + continue; + if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap)) + continue; + + /* Move to head of hash chain. 
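Both lookup paths keep the old move-to-front behavior: a hit is unlinked from its spot and relinked at the head of the chain, so frequently used hosts stay cheap to find. A sketch of that idiom on a singly linked list (the kernel uses hlist_del() followed by hlist_add_head()):

#include <stdio.h>
#include <stddef.h>

struct host {
	struct host *next;
	int key;
};

static struct host *lookup(struct host **chain, int key)
{
	struct host **pp, *h;

	for (pp = chain; (h = *pp) != NULL; pp = &h->next) {
		if (h->key != key)
			continue;
		*pp = h->next;		/* unlink from current spot */
		h->next = *chain;	/* relink at head of chain */
		*chain = h;
		return h;
	}
	return NULL;
}

int main(void)
{
	struct host c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct host *head = &a;

	lookup(&head, 3);
	printf("head key after lookup: %d\n", head->key);	/* 3 */
	return 0;
}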
*/ + hlist_del(&host->h_hash); + hlist_add_head(&host->h_hash, chain); + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + goto out; } - return nlm_lookup_host(&ni); + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + memcpy(nlm_srcaddr(host), src_sap, src_len); + host->h_srcaddrlen = src_len; + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmsvc_release_host - release server nlm_host + * @host: nlm_host to release + * + * Host is destroyed later in nlm_gc_host(). + */ +void nlmsvc_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release server host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(!host->h_server); + atomic_dec(&host->h_count); } /* @@ -413,20 +516,28 @@ struct nlm_host * nlm_get_host(struct nlm_host *host) return host; } -/* - * Release NLM host after use - */ -void nlm_release_host(struct nlm_host *host) +static struct nlm_host *next_host_state(struct hlist_head *cache, + struct nsm_handle *nsm, + const struct nlm_reboot *info) { - if (host != NULL) { - dprintk("lockd: release host %s\n", host->h_name); - BUG_ON(atomic_read(&host->h_count) < 0); - if (atomic_dec_and_test(&host->h_count)) { - BUG_ON(!list_empty(&host->h_lockowners)); - BUG_ON(!list_empty(&host->h_granted)); - BUG_ON(!list_empty(&host->h_reclaim)); + struct nlm_host *host = NULL; + struct hlist_head *chain; + struct hlist_node *pos; + + mutex_lock(&nlm_host_mutex); + for_each_host(host, pos, chain, cache) { + if (host->h_nsmhandle == nsm + && host->h_nsmstate != info->state) { + host->h_nsmstate = info->state; + host->h_state++; + + nlm_get_host(host); + goto out; } } +out: + mutex_unlock(&nlm_host_mutex); + return host; } /** @@ -438,8 +549,6 @@ void nlm_release_host(struct nlm_host *host) */ void nlm_host_rebooted(const struct nlm_reboot *info) { - struct hlist_head *chain; - struct hlist_node *pos; struct nsm_handle *nsm; struct nlm_host *host; @@ -452,32 +561,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info) * lock for this. * To avoid processing a host several times, we match the nsmstate. */ -again: mutex_lock(&nlm_host_mutex); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) { - if (host->h_nsmhandle == nsm - && host->h_nsmstate != info->state) { - host->h_nsmstate = info->state; - host->h_state++; - - nlm_get_host(host); - mutex_unlock(&nlm_host_mutex); - - if (host->h_server) { - /* We're server for this guy, just ditch - * all the locks he held. */ - nlmsvc_free_host_resources(host); - } else { - /* He's the server, initiate lock recovery. 
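The reboot notification handling is restructured around next_host_state(): pop the next matching host while holding the mutex, then process it (recovery or resource teardown) with the mutex dropped, and repeat until no host still carries the stale NSM state. A userspace sketch of that loop shape:

#include <pthread.h>
#include <stdio.h>
#include <stddef.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

struct host {
	struct host *next;
	int nsmstate;
	const char *name;
};

static struct host *next_host_state(struct host *list, int new_state)
{
	struct host *h;

	pthread_mutex_lock(&lock);
	for (h = list; h != NULL; h = h->next) {
		if (h->nsmstate != new_state) {
			h->nsmstate = new_state;  /* won't match again */
			break;
		}
	}
	pthread_mutex_unlock(&lock);
	return h;
}

int main(void)
{
	struct host b = { NULL, 3, "b" }, a = { &b, 3, "a" };
	struct host *h;

	/* each pass hands back at most one host, so the expensive
	 * per-host work runs without the mutex held */
	while ((h = next_host_state(&a, 7)) != NULL)
		printf("recovering %s\n", h->name);
	return 0;
}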
*/ - nlmclnt_recovery(host); - } - - nlm_release_host(host); - goto again; - } - } + while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) { + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); } - mutex_unlock(&nlm_host_mutex); + while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) { + nlmclnt_recovery(host); + nlmclnt_release_host(host); + } + nsm_release(nsm); } @@ -497,13 +589,11 @@ nlm_shutdown_hosts(void) /* First, make all hosts eligible for gc */ dprintk("lockd: nuking all hosts...\n"); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) { - host->h_expires = jiffies - 1; - if (host->h_rpcclnt) { - rpc_shutdown_client(host->h_rpcclnt); - host->h_rpcclnt = NULL; - } + for_each_host(host, pos, chain, nlm_server_hosts) { + host->h_expires = jiffies - 1; + if (host->h_rpcclnt) { + rpc_shutdown_client(host->h_rpcclnt); + host->h_rpcclnt = NULL; } } @@ -512,15 +602,13 @@ nlm_shutdown_hosts(void) mutex_unlock(&nlm_host_mutex); /* complain if any hosts are left */ - if (nrhosts) { + if (nrhosts != 0) { printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); - dprintk("lockd: %d hosts left:\n", nrhosts); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) { - dprintk(" %s (cnt %d use %d exp %ld)\n", - host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires); - } + dprintk("lockd: %lu hosts left:\n", nrhosts); + for_each_host(host, pos, chain, nlm_server_hosts) { + dprintk(" %s (cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); } } } @@ -538,29 +626,22 @@ nlm_gc_hosts(void) struct nlm_host *host; dprintk("lockd: host garbage collection\n"); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) - host->h_inuse = 0; - } + for_each_host(host, pos, chain, nlm_server_hosts) + host->h_inuse = 0; /* Mark all hosts that hold locks, blocks or shares */ nlmsvc_mark_resources(); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { - if (atomic_read(&host->h_count) || host->h_inuse - || time_before(jiffies, host->h_expires)) { - dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", - host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires); - continue; - } - dprintk("lockd: delete host %s\n", host->h_name); - hlist_del_init(&host->h_hash); - - nlm_destroy_host(host); - nrhosts--; + for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + if (atomic_read(&host->h_count) || host->h_inuse + || time_before(jiffies, host->h_expires)) { + dprintk("nlm_gc_hosts skipping %s " + "(cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); + continue; } + nlm_destroy_host_locked(host); } next_gc = jiffies + NLM_HOST_COLLECT; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e0c91894964..23d7451b293 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm) * Status Monitor wire protocol. 
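Stepping back to nlm_gc_hosts() above: the rewrite keeps the same mark-and-sweep shape — clear every h_inuse flag, let nlmsvc_mark_resources() re-mark hosts that still hold locks, blocks, or shares, then sweep entries that are unreferenced, unmarked, and expired. A sketch of just the collection predicate:

#include <stdio.h>
#include <stdbool.h>
#include <time.h>

struct host {
	int count;	/* active references */
	bool inuse;	/* re-marked by the resource scan */
	time_t expires;
};

static bool host_collectable(const struct host *h, time_t now)
{
	return h->count == 0 && !h->inuse && now >= h->expires;
}

int main(void)
{
	time_t now = time(NULL);
	struct host idle = { 0, false, now - 1 };
	struct host busy = { 1, true, now + 300 };

	printf("idle collectable: %d\n", host_collectable(&idle, now));
	printf("busy collectable: %d\n", host_collectable(&busy, now));
	return 0;
}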
*/ -static int encode_nsm_string(struct xdr_stream *xdr, const char *string) +static void encode_nsm_string(struct xdr_stream *xdr, const char *string) { const u32 len = strlen(string); __be32 *p; - if (unlikely(len > SM_MAXSTRLEN)) - return -EIO; - p = xdr_reserve_space(xdr, sizeof(u32) + len); - if (unlikely(p == NULL)) - return -EIO; + BUG_ON(len > SM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, string, len); - return 0; } /* * "mon_name" specifies the host to be monitored. */ -static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) { - return encode_nsm_string(xdr, argp->mon_name); + encode_nsm_string(xdr, argp->mon_name); } /* @@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" * has changed. */ -static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - int status; __be32 *p; - status = encode_nsm_string(xdr, utsname()->nodename); - if (unlikely(status != 0)) - return status; - p = xdr_reserve_space(xdr, 3 * sizeof(u32)); - if (unlikely(p == NULL)) - return -EIO; - *p++ = htonl(argp->prog); - *p++ = htonl(argp->vers); - *p++ = htonl(argp->proc); - return 0; + encode_nsm_string(xdr, utsname()->nodename); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(argp->prog); + *p++ = cpu_to_be32(argp->vers); + *p = cpu_to_be32(argp->proc); } /* * The "mon_id" argument specifies the non-private arguments * of an NSMPROC_MON or NSMPROC_UNMON call. */ -static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - int status; - - status = encode_mon_name(xdr, argp); - if (unlikely(status != 0)) - return status; - return encode_my_id(xdr, argp); + encode_mon_name(xdr, argp); + encode_my_id(xdr, argp); } /* @@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) * by the NSMPROC_MON call. This information will be supplied in the * NLMPROC_SM_NOTIFY call. 
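The rewritten encoders emit a my_id as an XDR string (the local nodename) followed by three 32-bit words for program, version, and procedure. A userspace sketch of that layout; the program/version/procedure values passed in main() are illustrative only:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static size_t encode_my_id(uint8_t *buf, const char *nodename,
			   uint32_t prog, uint32_t vers, uint32_t proc)
{
	uint32_t len = (uint32_t)strlen(nodename);
	size_t padded = (len + 3) & ~(size_t)3, off = 0;
	uint32_t w;

	w = htonl(len);  memcpy(buf + off, &w, 4); off += 4;
	memcpy(buf + off, nodename, len);
	memset(buf + off + len, 0, padded - len);	/* XDR padding */
	off += padded;
	w = htonl(prog); memcpy(buf + off, &w, 4); off += 4;
	w = htonl(vers); memcpy(buf + off, &w, 4); off += 4;
	w = htonl(proc); memcpy(buf + off, &w, 4); off += 4;
	return off;
}

int main(void)
{
	uint8_t buf[64];
	size_t n = encode_my_id(buf, "client.example", 100021, 4, 16);

	printf("my_id is %zu bytes\n", n);
	return 0;
}

Dropping the int return values works because xdr_reserve_space() cannot fail for these small, pre-budgeted requests; genuinely impossible conditions become BUG_ON()s instead.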
*/ -static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) { __be32 *p; p = xdr_reserve_space(xdr, SM_PRIV_SIZE); - if (unlikely(p == NULL)) - return -EIO; xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); - return 0; } -static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, - const struct nsm_args *argp) +static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) { - struct xdr_stream xdr; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - status = encode_mon_id(&xdr, argp); - if (unlikely(status)) - return status; - return encode_priv(&xdr, argp); + encode_mon_id(xdr, argp); + encode_priv(xdr, argp); } -static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, - const struct nsm_args *argp) +static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - return encode_mon_id(&xdr, argp); + encode_mon_id(xdr, argp); } -static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, - struct nsm_res *resp) +static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) { - struct xdr_stream xdr; + __be32 *p; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - p = xdr_inline_decode(&xdr, 2 * sizeof(u32)); + p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(p == NULL)) return -EIO; - resp->status = ntohl(*p++); - resp->state = ntohl(*p); + resp->status = be32_to_cpup(p++); + resp->state = be32_to_cpup(p); - dprintk("lockd: xdr_dec_stat_res status %d state %d\n", - resp->status, resp->state); + dprintk("lockd: %s status %d state %d\n", + __func__, resp->status, resp->state); return 0; } -static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, - struct nsm_res *resp) +static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) { - struct xdr_stream xdr; + __be32 *p; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - p = xdr_inline_decode(&xdr, sizeof(u32)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - resp->state = ntohl(*p); + resp->state = be32_to_cpup(p); - dprintk("lockd: xdr_dec_stat state %d\n", resp->state); + dprintk("lockd: %s state %d\n", __func__, resp->state); return 0; } @@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, static struct rpc_procinfo nsm_procedures[] = { [NSMPROC_MON] = { .p_proc = NSMPROC_MON, - .p_encode = (kxdrproc_t)xdr_enc_mon, - .p_decode = (kxdrproc_t)xdr_dec_stat_res, + .p_encode = (kxdreproc_t)nsm_xdr_enc_mon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res, .p_arglen = SM_mon_sz, .p_replen = SM_monres_sz, .p_statidx = NSMPROC_MON, @@ -551,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = { }, [NSMPROC_UNMON] = { .p_proc = NSMPROC_UNMON, - .p_encode = (kxdrproc_t)xdr_enc_unmon, - .p_decode = (kxdrproc_t)xdr_dec_stat, + .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat, .p_arglen = SM_mon_id_sz, .p_replen = SM_unmonres_sz, .p_statidx = NSMPROC_UNMON, diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 38d26119245..9a41fdc1951 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -51,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - nlm_release_host(host); + nlmsvc_release_host(host); if (error) return error; return 
nlm_lck_denied_nolocks; @@ -92,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -134,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -164,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_cancel_blocked(file, &argp->lock); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -197,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_unlock(file, &argp->lock); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -229,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) static void nlm4svc_callback_release(void *data) { - nlm_release_call(data); + nlmsvc_release_call(data); } static const struct rpc_call_ops nlm4svc_callback_ops = { @@ -261,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args stat = func(rqstp, argp, &call->a_res); if (stat != 0) { - nlm_release_call(call); + nlmsvc_release_call(call); return stat; } @@ -334,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_share_file(host, file, argp); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -367,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_unshare_file(host, file, argp); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -399,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, return rpc_success; nlmsvc_free_host_resources(host); - nlm_release_host(host); + nlmsvc_release_host(host); return rpc_success; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index ef5659b211e..6e31695d046 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -46,6 +46,7 @@ static void nlmsvc_remove_block(struct nlm_block *block); static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); static void nlmsvc_freegrantargs(struct nlm_rqst *call); static const struct rpc_call_ops nlmsvc_grant_ops; +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie); /* * The list of blocked locks to retry @@ -233,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, failed_free: kfree(block); failed: - nlm_release_call(call); + nlmsvc_release_call(call); return NULL; } @@ -266,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref) mutex_unlock(&file->f_mutex); nlmsvc_freegrantargs(block->b_call); - nlm_release_call(block->b_call); + nlmsvc_release_call(block->b_call); nlm_release_file(block->b_file); kfree(block->b_fl); kfree(block); @@ -934,3 +935,32 @@ nlmsvc_retry_blocked(void) return timeout; } + +#ifdef RPC_DEBUG +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + /* + * We 
can get away with a static buffer because we're only + * called with BKL held. + */ + static char buf[2*NLM_MAXCOOKIELEN+1]; + unsigned int i, len = sizeof(buf); + char *p = buf; + + len--; /* allow for trailing \0 */ + if (len < 3) + return "???"; + for (i = 0 ; i < cookie->len ; i++) { + if (len < 2) { + strcpy(p-3, "..."); + break; + } + sprintf(p, "%02x", cookie->data[i]); + p += 2; + len -= 2; + } + *p = '\0'; + + return buf; +} +#endif diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0caea5310ac..d27aab11f32 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -80,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - nlm_release_host(host); + nlmsvc_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -122,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST status %d vers %d\n", ntohl(resp->status), rqstp->rq_vers); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -164,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -194,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -227,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -257,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) -task->tk_status); } +void nlmsvc_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmsvc_release_host(call->a_host); + kfree(call); +} + static void nlmsvc_callback_release(void *data) { - nlm_release_call(data); + nlmsvc_release_call(data); } static const struct rpc_call_ops nlmsvc_callback_ops = { @@ -291,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args stat = func(rqstp, argp, &call->a_res); if (stat != 0) { - nlm_release_call(call); + nlmsvc_release_call(call); return stat; } @@ -366,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_share_file(host, file, argp)); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -399,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -431,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, return rpc_success; nlmsvc_free_host_resources(host); - nlm_release_host(host); + nlmsvc_release_host(host); return rpc_success; } diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index b583ab0a4cb..964666c68a8 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c 
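Before the xdr.c hunk below: the nlmdbg_cookie2a() helper moved into svclock.c above renders a cookie as hex, falling back to a "..." suffix when the output would overflow its buffer. A standalone version of the same formatter:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define NLM_MAXCOOKIELEN 32

static const char *cookie2a(const uint8_t *data, unsigned int clen)
{
	static char buf[2 * NLM_MAXCOOKIELEN + 1];
	unsigned int i, len = sizeof(buf) - 1;	/* room for '\0' */
	char *p = buf;

	for (i = 0; i < clen; i++) {
		if (len < 2) {			/* oversized cookie */
			strcpy(p - 3, "...");
			break;
		}
		sprintf(p, "%02x", data[i]);
		p += 2;
		len -= 2;
	}
	*p = '\0';
	return buf;
}

int main(void)
{
	uint8_t cookie[4] = { 0xde, 0xad, 0xbe, 0xef };

	printf("cookie %s\n", cookie2a(cookie, 4));
	return 0;
}

The static buffer is only safe because, as the kernel comment notes, callers are serialized; a reentrant version would take a caller-supplied buffer.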
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock) } /* - * Encode a lock as part of an NLM call - */ -static __be32 * -nlm_encode_lock(__be32 *p, struct nlm_lock *lock) -{ - struct file_lock *fl = &lock->fl; - __s32 start, len; - - if (!(p = xdr_encode_string(p, lock->caller)) - || !(p = nlm_encode_fh(p, &lock->fh)) - || !(p = nlm_encode_oh(p, &lock->oh))) - return NULL; - - if (fl->fl_start > NLM_OFFSET_MAX - || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX)) - return NULL; - - start = loff_t_to_s32(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); - - *p++ = htonl(lock->svid); - *p++ = htonl(start); - *p++ = htonl(len); - - return p; -} - -/* * Encode result of a TEST/TEST_MSG call */ static __be32 * @@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) { return xdr_ressize_check(rqstp, p); } - -/* - * Now, the client side XDR functions - */ -#ifdef NLMCLNT_SUPPORT_SHARES -static int -nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr) -{ - return 0; -} -#endif - -static int -nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - u32 excl; - s32 start, len, end; - - memset(&resp->lock, 0, sizeof(resp->lock)); - locks_init_lock(fl); - excl = ntohl(*p++); - resp->lock.svid = ntohl(*p++); - fl->fl_pid = (pid_t)resp->lock.svid; - if (!(p = nlm_decode_oh(p, &resp->lock.oh))) - return -EIO; - - fl->fl_flags = FL_POSIX; - fl->fl_type = excl? F_WRLCK : F_RDLCK; - start = ntohl(*p++); - len = ntohl(*p++); - end = start + len - 1; - - fl->fl_start = s32_to_loff_t(start); - if (len == 0 || end < 0) - fl->fl_end = OFFSET_MAX; - else - fl->fl_end = s32_to_loff_t(end); - } - return 0; -} - - -static int -nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - *p++ = argp->reclaim? xdr_one : xdr_zero; - *p++ = htonl(argp->state); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? 
xdr_one : xdr_zero; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_encode_cookie(p, &resp->cookie))) - return -EIO; - *p++ = resp->status; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_encode_testres(p, resp))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - return 0; -} - -#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) -# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" -#endif - -/* - * Buffer requirements for NLM - */ -#define NLM_void_sz 0 -#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) -#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE) -#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz -#define NLM_holder_sz 4+NLM_owner_sz - -#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz -#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz -#define NLM_cancargs_sz NLM_cookie_sz+2+NLM_lock_sz -#define NLM_unlockargs_sz NLM_cookie_sz+NLM_lock_sz - -#define NLM_testres_sz NLM_cookie_sz+1+NLM_holder_sz -#define NLM_res_sz NLM_cookie_sz+1 -#define NLM_norep_sz 0 - -/* - * For NLM, a void procedure really returns nothing - */ -#define nlmclt_decode_norep NULL - -#define PROC(proc, argtype, restype) \ -[NLMPROC_##proc] = { \ - .p_proc = NLMPROC_##proc, \ - .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \ - .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \ - .p_arglen = NLM_##argtype##_sz, \ - .p_replen = NLM_##restype##_sz, \ - .p_statidx = NLMPROC_##proc, \ - .p_name = #proc, \ - } - -static struct rpc_procinfo nlm_procedures[] = { - PROC(TEST, testargs, testres), - PROC(LOCK, lockargs, res), - PROC(CANCEL, cancargs, res), - PROC(UNLOCK, unlockargs, res), - PROC(GRANTED, testargs, res), - PROC(TEST_MSG, testargs, norep), - PROC(LOCK_MSG, lockargs, norep), - PROC(CANCEL_MSG, cancargs, norep), - PROC(UNLOCK_MSG, unlockargs, norep), - PROC(GRANTED_MSG, testargs, norep), - PROC(TEST_RES, testres, norep), - PROC(LOCK_RES, res, norep), - PROC(CANCEL_RES, res, norep), - PROC(UNLOCK_RES, res, norep), - PROC(GRANTED_RES, res, norep), -#ifdef NLMCLNT_SUPPORT_SHARES - PROC(SHARE, shareargs, shareres), - PROC(UNSHARE, shareargs, shareres), - PROC(NM_LOCK, lockargs, res), - PROC(FREE_ALL, notify, void), -#endif -}; - -static struct rpc_version nlm_version1 = { - .number = 1, - .nrprocs = 16, - .procs = nlm_procedures, -}; - -static struct rpc_version nlm_version3 = { - .number = 3, - .nrprocs = 24, - .procs = nlm_procedures, -}; - -static struct rpc_version * nlm_versions[] = { - [1] = &nlm_version1, - [3] = &nlm_version3, -#ifdef CONFIG_LOCKD_V4 - [4] = &nlm_version4, -#endif -}; - -static struct rpc_stat nlm_stats; - -struct rpc_program nlm_program = 
{ - .name = "lockd", - .number = NLM_PROGRAM, - .nrvers = ARRAY_SIZE(nlm_versions), - .version = nlm_versions, - .stats = &nlm_stats, -}; - -#ifdef RPC_DEBUG -const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) -{ - /* - * We can get away with a static buffer because we're only - * called with BKL held. - */ - static char buf[2*NLM_MAXCOOKIELEN+1]; - unsigned int i, len = sizeof(buf); - char *p = buf; - - len--; /* allow for trailing \0 */ - if (len < 3) - return "???"; - for (i = 0 ; i < cookie->len ; i++) { - if (len < 2) { - strcpy(p-3, "..."); - break; - } - sprintf(p, "%02x", cookie->data[i]); - p += 2; - len -= 2; - } - *p = '\0'; - - return buf; -} -#endif diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index ad9dbbc9145..dfa4789cd46 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f) return p + XDR_QUADLEN(f->size); } -static __be32 * -nlm4_encode_fh(__be32 *p, struct nfs_fh *f) -{ - *p++ = htonl(f->size); - if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */ - memcpy(p, f->data, f->size); - return p + XDR_QUADLEN(f->size); -} - /* * Encode and decode owner handle */ @@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh) } static __be32 * -nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_encode_netobj(p, oh); -} - -static __be32 * nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) { struct file_lock *fl = &lock->fl; @@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) } /* - * Encode a lock as part of an NLM call - */ -static __be32 * -nlm4_encode_lock(__be32 *p, struct nlm_lock *lock) -{ - struct file_lock *fl = &lock->fl; - __s64 start, len; - - if (!(p = xdr_encode_string(p, lock->caller)) - || !(p = nlm4_encode_fh(p, &lock->fh)) - || !(p = nlm4_encode_oh(p, &lock->oh))) - return NULL; - - if (fl->fl_start > NLM4_OFFSET_MAX - || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX)) - return NULL; - - *p++ = htonl(lock->svid); - - start = loff_t_to_s64(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); - - p = xdr_encode_hyper(p, start); - p = xdr_encode_hyper(p, len); - - return p; -} - -/* * Encode result of a TEST/TEST_MSG call */ static __be32 * @@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) { return xdr_ressize_check(rqstp, p); } - -/* - * Now, the client side XDR functions - */ -#ifdef NLMCLNT_SUPPORT_SHARES -static int -nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr) -{ - return 0; -} -#endif - -static int -nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = (lock->fl.fl_type == F_WRLCK)? 
xdr_one : xdr_zero; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - u32 excl; - __u64 start, len; - __s64 end; - - memset(&resp->lock, 0, sizeof(resp->lock)); - locks_init_lock(fl); - excl = ntohl(*p++); - resp->lock.svid = ntohl(*p++); - fl->fl_pid = (pid_t)resp->lock.svid; - if (!(p = nlm4_decode_oh(p, &resp->lock.oh))) - return -EIO; - - fl->fl_flags = FL_POSIX; - fl->fl_type = excl? F_WRLCK : F_RDLCK; - p = xdr_decode_hyper(p, &start); - p = xdr_decode_hyper(p, &len); - end = start + len - 1; - - fl->fl_start = s64_to_loff_t(start); - if (len == 0 || end < 0) - fl->fl_end = OFFSET_MAX; - else - fl->fl_end = s64_to_loff_t(end); - } - return 0; -} - - -static int -nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - *p++ = argp->reclaim? xdr_one : xdr_zero; - *p++ = htonl(argp->state); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) - return -EIO; - *p++ = resp->status; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_encode_testres(p, resp))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - return 0; -} - -#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) -# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" -#endif - -#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN) -# error "NLM host name cannot be larger than NLM's maximum string length!" 
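These #if/#error guards are compile-time size checks: they refuse to build a kernel whose owner-handle buffer could overflow a netobj or an NLM string. In C11 userspace the same invariant reads as a static assertion; the constant values here are stand-ins for the kernel's:

#include <assert.h>

#define NLMCLNT_OHSIZE 64	/* assumed for illustration */
#define XDR_MAX_NETOBJ 1024	/* assumed */
#define NLM_MAXSTRLEN 1024	/* assumed */

static_assert(NLMCLNT_OHSIZE <= XDR_MAX_NETOBJ,
	      "NLM host name cannot be larger than XDR_MAX_NETOBJ");
static_assert(NLMCLNT_OHSIZE <= NLM_MAXSTRLEN,
	      "NLM host name cannot be larger than NLM's maximum string length");

int main(void)
{
	return 0;
}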
-#endif - -/* - * Buffer requirements for NLM - */ -#define NLM4_void_sz 0 -#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) -#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE) -#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz -#define NLM4_holder_sz 6+NLM4_owner_sz - -#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz -#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz -#define NLM4_cancargs_sz NLM4_cookie_sz+2+NLM4_lock_sz -#define NLM4_unlockargs_sz NLM4_cookie_sz+NLM4_lock_sz - -#define NLM4_testres_sz NLM4_cookie_sz+1+NLM4_holder_sz -#define NLM4_res_sz NLM4_cookie_sz+1 -#define NLM4_norep_sz 0 - -/* - * For NLM, a void procedure really returns nothing - */ -#define nlm4clt_decode_norep NULL - -#define PROC(proc, argtype, restype) \ -[NLMPROC_##proc] = { \ - .p_proc = NLMPROC_##proc, \ - .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \ - .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \ - .p_arglen = NLM4_##argtype##_sz, \ - .p_replen = NLM4_##restype##_sz, \ - .p_statidx = NLMPROC_##proc, \ - .p_name = #proc, \ - } - -static struct rpc_procinfo nlm4_procedures[] = { - PROC(TEST, testargs, testres), - PROC(LOCK, lockargs, res), - PROC(CANCEL, cancargs, res), - PROC(UNLOCK, unlockargs, res), - PROC(GRANTED, testargs, res), - PROC(TEST_MSG, testargs, norep), - PROC(LOCK_MSG, lockargs, norep), - PROC(CANCEL_MSG, cancargs, norep), - PROC(UNLOCK_MSG, unlockargs, norep), - PROC(GRANTED_MSG, testargs, norep), - PROC(TEST_RES, testres, norep), - PROC(LOCK_RES, res, norep), - PROC(CANCEL_RES, res, norep), - PROC(UNLOCK_RES, res, norep), - PROC(GRANTED_RES, res, norep), -#ifdef NLMCLNT_SUPPORT_SHARES - PROC(SHARE, shareargs, shareres), - PROC(UNSHARE, shareargs, shareres), - PROC(NM_LOCK, lockargs, res), - PROC(FREE_ALL, notify, void), -#endif -}; - -struct rpc_version nlm_version4 = { - .number = 4, - .nrprocs = 24, - .procs = nlm4_procedures, -}; diff --git a/fs/locks.c b/fs/locks.c index 08415b2a6d3..0f3998291f7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl) fl->fl_file->f_owner.signum = 0; } -static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try) -{ - return fl->fl_file == try->fl_file; -} - static const struct lock_manager_operations lease_manager_ops = { .fl_break = lease_break_callback, .fl_release_private = lease_release_private_callback, - .fl_mylease = lease_mylease_callback, .fl_change = lease_modify, }; @@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (lease->fl_lmops->fl_mylease(fl, lease)) + if (fl->fl_file == filp) my_before = before; else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) /* diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 92ca6fbe09b..723bc5bca09 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page) static void bdev_put_device(struct logfs_super *s) { - close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int bdev_can_write_buf(struct super_block *sb, u64 ofs) @@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, { struct block_device *bdev; - bdev = 
open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + type); if (IS_ERR(bdev)) return PTR_ERR(bdev); if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { int mtdnr = MINOR(bdev->bd_dev); - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); return logfs_get_sb_mtd(p, mtdnr); } diff --git a/fs/mbcache.c b/fs/mbcache.c index 93444747237..a25444ab2ba 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first); EXPORT_SYMBOL(mb_cache_entry_find_next); #endif -struct mb_cache { - struct list_head c_cache_list; - const char *c_name; - atomic_t c_entry_count; - int c_max_entries; - int c_bucket_bits; - struct kmem_cache *c_entry_cache; - struct list_head *c_block_hash; - struct list_head *c_index_hash; -}; - - /* * Global data: list of all mbcache's, lru list, and a spinlock for * accessing cache data structures on SMP machines. The lru list is diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 1b9e07728a9..ce7337ddfdb 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st struct inode * inode = NULL; ino_t ino; - d_set_d_op(dentry, dir->i_sb->s_root->d_op); - if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) return ERR_PTR(-ENAMETOOLONG); diff --git a/fs/mpage.c b/fs/mpage.c index fd56ca2ea55..d78455a81ec 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -40,7 +40,7 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io_read(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); -} - -static void mpage_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (!uptodate){ - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + if (bio_data_dir(bio) == READ) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* bio_data_dir(bio) == WRITE */ + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + end_page_writeback(page); } - end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); } static struct bio *mpage_bio_submit(int rw, struct bio *bio) { - bio->bi_end_io = mpage_end_io_read; - if (rw == WRITE) - bio->bi_end_io = mpage_end_io_write; + bio->bi_end_io = mpage_end_io; submit_bio(rw, bio); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index 19433cdba01..7d77f24d32a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -202,7 +202,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag * @inode: inode to check access 
rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @check_acl: optional callback to check for Posix ACLs - * @flags IPERM_FLAG_ flags. + * @flags: IPERM_FLAG_ flags. * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions @@ -368,18 +368,6 @@ void path_get(struct path *path) EXPORT_SYMBOL(path_get); /** - * path_get_long - get a long reference to a path - * @path: path to get the reference to - * - * Given a path increment the reference count to the dentry and the vfsmount. - */ -void path_get_long(struct path *path) -{ - mntget_long(path->mnt); - dget(path->dentry); -} - -/** * path_put - put a reference to a path * @path: path to put the reference to * @@ -393,21 +381,9 @@ void path_put(struct path *path) EXPORT_SYMBOL(path_put); /** - * path_put_long - put a long reference to a path - * @path: path to put the reference to - * - * Given a path decrement the reference count to the dentry and the vfsmount. - */ -void path_put_long(struct path *path) -{ - dput(path->dentry); - mntput_long(path->mnt); -} - -/** * nameidata_drop_rcu - drop this nameidata out of rcu-walk * @nd: nameidata pathwalk data to drop - * @Returns: 0 on success, -ECHLID on failure + * Returns: 0 on success, -ECHILD on failure * * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt @@ -468,7 +444,7 @@ static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk * @nd: nameidata pathwalk data to drop * @dentry: dentry to drop - * @Returns: 0 on success, -ECHLID on failure + * Returns: 0 on success, -ECHILD on failure * * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on @@ -479,6 +455,14 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; + /* + * It can be possible to revalidate the dentry that we started + * the path walk with. force_reval_path may also revalidate the + * dentry already committed to the nameidata. + */ + if (unlikely(parent == dentry)) + return nameidata_drop_rcu(nd); + BUG_ON(!(nd->flags & LOOKUP_RCU)); if (nd->root.mnt) { spin_lock(&fs->lock); @@ -530,7 +514,7 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d /** * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk * @nd: nameidata pathwalk data to drop - * @Returns: 0 on success, -ECHLID on failure + * Returns: 0 on success, -ECHILD on failure * * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. * nd->path should be the final element of the lookup, so nd->root is discarded. @@ -583,6 +567,13 @@ void release_open_intent(struct nameidata *nd) fput(nd->intent.open.file); } +/* + * Call d_revalidate and handle filesystems that request rcu-walk + * to be dropped. This may be called and return in rcu-walk mode, + * regardless of success or error. If -ECHILD is returned, the caller + * must return -ECHILD back up the path walk stack so path walk may + * be restarted in ref-walk mode. 
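The contract described in the comment above is easiest to see from the filesystem side. A minimal sketch of a ->d_revalidate method that cannot run under rcu-walk and so forces a ref-walk restart (hypothetical "foofs"; ncpfs adopts exactly this pattern later in this series):

static int foofs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;	/* may sleep below: restart in ref-walk */

	/* ... sleeping revalidation against the backing store ... */
	return 1;		/* dentry is still good */
}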
+ */ static int d_revalidate(struct dentry *dentry, struct nameidata *nd) { int status; @@ -673,6 +664,9 @@ force_reval_path(struct path *path, struct nameidata *nd) return 0; if (!status) { + /* Don't d_invalidate in rcu-walk mode */ + if (nameidata_drop_rcu(nd)) + return -ECHILD; d_invalidate(dentry); status = -ESTALE; } @@ -761,7 +755,8 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) mntput(path->mnt); } -static inline void path_to_nameidata(struct path *path, struct nameidata *nd) +static inline void path_to_nameidata(const struct path *path, + struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) { dput(nd->path.dentry); @@ -773,20 +768,16 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd) } static __always_inline int -__do_follow_link(struct path *path, struct nameidata *nd, void **p) +__do_follow_link(const struct path *link, struct nameidata *nd, void **p) { int error; - struct dentry *dentry = path->dentry; + struct dentry *dentry = link->dentry; - touch_atime(path->mnt, dentry); + touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); - if (path->mnt != nd->path.mnt) { - path_to_nameidata(path, nd); - nd->inode = nd->path.dentry->d_inode; - dget(dentry); - } - mntget(path->mnt); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); @@ -877,54 +868,148 @@ int follow_up(struct path *path) } /* - * serialization is taken care of in namespace.c + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. */ -static void __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); - if (!mounted) - return; - path->mnt = mounted; - path->dentry = mounted->mnt_root; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); - *inode = path->dentry->d_inode; + struct vfsmount *mnt; + int err; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT + * and this is the terminal part of the path. + */ + if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE)) + return -EISDIR; /* we actually want to stop here */ + + /* We want to mount if someone is trying to open/create a file of any + * type under the mountpoint, wants to traverse through the mountpoint + * or wants to open the mounted directory. + * + * We don't want to mount if someone's just doing a stat and they've + * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and + * appended a '/' to the name. + */ + if (!(flags & LOOKUP_FOLLOW) && + !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE))) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. 
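For comparison, the shape of a ->d_automount method obeying the conventions follow_automount() enforces; a sketch under the assumption that foofs_build_submount() is a helper returning a ready vfsmount or an ERR_PTR. finish_automount(), shown further below, expects at least two references on the new mount:

static struct vfsmount *foofs_d_automount(struct path *path)
{
	struct vfsmount *mnt;

	mnt = foofs_build_submount(path);	/* assumed helper */
	if (IS_ERR(mnt))
		return mnt;	/* e.g. ERR_PTR(-EISDIR) to decline */
	mntget(mnt);	/* second ref, for finish_automount() */
	return mnt;	/* returning NULL would signal a mount collision */
}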
+ */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + return -EREMOTE; + return PTR_ERR(mnt); } -} -static int __follow_mount(struct path *path) -{ - int res = 0; - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; + if (!mnt) /* mount collision */ + return 0; + + err = finish_automount(mnt, path); + + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + return 0; + case 0: dput(path->dentry); - if (res) + if (*need_mntput) mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - res = 1; + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + *need_mntput = true; + return 0; + default: + return err; } - return res; + } -static void follow_mount(struct path *path) +/* + * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. + * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); + unsigned managed; + bool need_mntput = false; + int ret; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, + false, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before we managed to get the + * vfsmount_lock */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + continue; + } + + /* We didn't change the current path point */ + break; } + return 0; } -int follow_down(struct path *path) +int follow_down_one(struct path *path) { struct vfsmount *mounted; @@ -939,13 +1024,41 @@ int follow_down(struct path *path) return 0; } +/* + * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we + * meet a managed dentry and we're not walking to "..". True is returned to + * continue, false to abort. 
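follow_managed() above consults ->d_manage on every transit, giving the owning filesystem a veto. A sketch of the hook's contract as this series defines it, with autofs the intended in-tree user (foofs and its wait helper are hypothetical):

static int foofs_d_manage(struct dentry *dentry, bool mounting_here,
			  bool rcu_walk)
{
	if (rcu_walk)
		return -ECHILD;	/* we may sleep: force ref-walk */
	if (mounting_here)
		return 0;	/* let mounts proceed (a real fs might
				 * check for its own daemon here) */
	/* block ordinary walkers until the dentry is ready;
	 * returns 0 to continue, -EISDIR to stop here, or an error */
	return foofs_wait_ready(dentry);	/* assumed helper */
}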
+ */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode, bool reverse_transit) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted; + if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + !reverse_transit && + path->dentry->d_op->d_manage(path->dentry, false, true) < 0) + return false; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + break; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *inode = path->dentry->d_inode; + } + + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + return reverse_transit; + return true; +} + static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; set_root_rcu(nd); - while(1) { + while (1) { if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; @@ -968,12 +1081,80 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); inode = nd->path.dentry->d_inode; } - __follow_mount_rcu(nd, &nd->path, &inode); + __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; return 0; } +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + * + * Care must be taken as namespace_sem may be held (indicated by mounting_here + * being true). + */ +int follow_down(struct path *path, bool mounting_here) +{ + unsigned managed; + int ret; + + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage( + path->dentry, mounting_here, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} + +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + } +} + static void follow_dotdot(struct nameidata *nd) { set_root(nd); @@ -1038,12 +1219,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; struct inode *dir; + int err; + /* * See if the low-level filesystem might want * to use its own hash.. 
*/ if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - int err = parent->d_op->d_hash(parent, nd->inode, name); + err = parent->d_op->d_hash(parent, nd->inode, name); if (err < 0) return err; } @@ -1070,22 +1253,30 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, nd->seq = seq; if (dentry->d_flags & DCACHE_OP_REVALIDATE) goto need_revalidate; +done2: path->mnt = mnt; path->dentry = dentry; - __follow_mount_rcu(nd, path, inode); - } else { - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; + if (likely(__follow_mount_rcu(nd, path, inode, false))) + return 0; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + /* fallthru */ + } + dentry = __d_lookup(parent, name); + if (!dentry) + goto need_lookup; found: - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; + if (dentry->d_flags & DCACHE_OP_REVALIDATE) + goto need_revalidate; done: - path->mnt = mnt; - path->dentry = dentry; - __follow_mount(path); - *inode = path->dentry->d_inode; - } + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + *inode = path->dentry->d_inode; return 0; need_lookup: @@ -1124,6 +1315,8 @@ need_revalidate: goto need_lookup; if (IS_ERR(dentry)) goto fail; + if (nd->flags & LOOKUP_RCU) + goto done2; goto done; fail: @@ -1131,17 +1324,6 @@ fail: } /* - * This is a temporary kludge to deal with "automount" symlinks; proper - * solution is to trigger them on follow_mount(), so that do_lookup() - * would DTRT. To be killed before 2.6.34-final. - */ -static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) -{ - return inode && unlikely(inode->i_op->follow_link) && - ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); -} - -/* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. 
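do_lookup() above defers to the parent's ->d_hash whenever DCACHE_OP_HASH is set. For illustration, a case-folding hash of the kind ncpfs and vfat supply (sketch only; the name-hash helpers come from <linux/dcache.h>, tolower() from <linux/ctype.h>, and the signature matches the call site above):

static int foofs_d_hash(const struct dentry *dentry,
			const struct inode *inode, struct qstr *q)
{
	unsigned long hash = init_name_hash();
	unsigned int i;

	for (i = 0; i < q->len; i++)
		hash = partial_name_hash(tolower(q->name[i]), hash);
	q->hash = end_name_hash(hash);
	return 0;
}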
@@ -1279,7 +1461,8 @@ last_component: err = do_lookup(nd, &this, &next, &inode); if (err) break; - if (follow_on_final(inode, lookup_flags)) { + if (inode && unlikely(inode->i_op->follow_link) && + (lookup_flags & LOOKUP_FOLLOW)) { if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) return -ECHILD; BUG_ON(inode != next.dentry->d_inode); @@ -1950,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag) return break_lease(inode, flag); } -static int handle_truncate(struct path *path) +static int handle_truncate(struct file *filp) { + struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) @@ -1965,7 +2149,7 @@ static int handle_truncate(struct path *path) if (!error) { error = do_truncate(path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, - NULL); + filp); } put_write_access(inode); return error; @@ -2063,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd, } if (!IS_ERR(filp)) { if (will_truncate) { - error = handle_truncate(&nd->path); + error = handle_truncate(filp); if (error) { fput(filp); filp = ERR_PTR(error); @@ -2104,11 +2288,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path, dir = nd->path.dentry; case LAST_DOT: if (need_reval_dot(dir)) { - error = d_revalidate(nd->path.dentry, nd); - if (!error) - error = -ESTALE; - if (error < 0) + int status = d_revalidate(nd->path.dentry, nd); + if (!status) + status = -ESTALE; + if (status < 0) { + error = status; goto exit; + } } /* fallthrough */ case LAST_ROOT: @@ -2178,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (open_flag & O_EXCL) goto exit_dput; - if (__follow_mount(path)) { - error = -ELOOP; - if (open_flag & O_NOFOLLOW) - goto exit_dput; - } + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; error = -ENOENT; if (!path->dentry->d_inode) @@ -2327,11 +2511,11 @@ reval: nd.flags = flags; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); while (unlikely(!filp)) { /* trailing symlink */ - struct path holder; + struct path link = path; + struct inode *linki = link.dentry->d_inode; void *cookie; error = -ELOOP; - /* S_ISDIR part is a temporary automount kludge */ - if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode)) + if (!(nd.flags & LOOKUP_FOLLOW)) goto exit_dput; if (count++ == 32) goto exit_dput; @@ -2347,23 +2531,22 @@ reval: * just set LAST_BIND. 
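The trailing-symlink handling above threads a cookie from ->follow_link through to the matching ->put_link call. The minimal shape of such a pair (hypothetical foofs; foofs_read_link() is an assumed helper returning a kmalloc'd string or an ERR_PTR, and the error pointer doubles as the cookie, as the loop's IS_ERR(cookie) check assumes):

static void *foofs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	char *target = foofs_read_link(dentry);	/* assumed helper */

	if (!IS_ERR(target))
		nd_set_link(nd, target);
	return target;		/* cookie handed back to ->put_link */
}

static void foofs_put_link(struct dentry *dentry, struct nameidata *nd,
			   void *cookie)
{
	kfree(cookie);	/* only reached when follow_link succeeded */
}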
*/ nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(path.dentry, &nd); + error = security_inode_follow_link(link.dentry, &nd); if (error) goto exit_dput; - error = __do_follow_link(&path, &nd, &cookie); + error = __do_follow_link(&link, &nd, &cookie); if (unlikely(error)) { - if (!IS_ERR(cookie) && nd.inode->i_op->put_link) - nd.inode->i_op->put_link(path.dentry, &nd, cookie); + if (!IS_ERR(cookie) && linki->i_op->put_link) + linki->i_op->put_link(link.dentry, &nd, cookie); /* nd.path had been dropped */ - nd.path = path; + nd.path = link; goto out_path; } - holder = path; nd.flags &= ~LOOKUP_PARENT; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (nd.inode->i_op->put_link) - nd.inode->i_op->put_link(holder.dentry, &nd, cookie); - path_put(&holder); + if (linki->i_op->put_link) + linki->i_op->put_link(link.dentry, &nd, cookie); + path_put(&link); } out: if (nd.root.mnt) @@ -3391,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = { }; EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(follow_down_one); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ diff --git a/fs/namespace.c b/fs/namespace.c index 3ddfd9046c4..7b0b9537169 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt) unsigned int mnt_get_count(struct vfsmount *mnt) { #ifdef CONFIG_SMP - unsigned int count = atomic_read(&mnt->mnt_longrefs); + unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { @@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) if (!mnt->mnt_pcp) goto out_free_devname; - atomic_set(&mnt->mnt_longrefs, 1); + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; @@ -611,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path) list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); } +static inline void __mnt_make_longterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_inc(&mnt->mnt_longterm); +#endif +} + +/* needs vfsmount lock for write */ +static inline void __mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_dec(&mnt->mnt_longterm); +#endif +} + /* * vfsmount lock must be held for write */ @@ -624,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) + list_for_each_entry(m, &head, mnt_list) { m->mnt_ns = n; + __mnt_make_longterm(m); + } + list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + @@ -734,51 +752,30 @@ static inline void mntfree(struct vfsmount *mnt) deactivate_super(sb); } -#ifdef CONFIG_SMP -static inline void __mntput(struct vfsmount *mnt, int longrefs) +static void mntput_no_expire(struct vfsmount *mnt) { - if (!longrefs) { put_again: - br_read_lock(vfsmount_lock); - if (likely(atomic_read(&mnt->mnt_longrefs))) { - mnt_dec_count(mnt); - br_read_unlock(vfsmount_lock); - return; - } +#ifdef CONFIG_SMP + br_read_lock(vfsmount_lock); + if (likely(atomic_read(&mnt->mnt_longterm))) { + mnt_dec_count(mnt); br_read_unlock(vfsmount_lock); - } else { - BUG_ON(!atomic_read(&mnt->mnt_longrefs)); - if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1)) - return; + return; } + br_read_unlock(vfsmount_lock); br_write_lock(vfsmount_lock); - if (!longrefs) - mnt_dec_count(mnt); - else - atomic_dec(&mnt->mnt_longrefs); + mnt_dec_count(mnt); if (mnt_get_count(mnt)) { br_write_unlock(vfsmount_lock); 
return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - br_write_unlock(vfsmount_lock); - acct_auto_close_mnt(mnt); - goto put_again; - } - br_write_unlock(vfsmount_lock); - mntfree(mnt); -} #else -static inline void __mntput(struct vfsmount *mnt, int longrefs) -{ -put_again: mnt_dec_count(mnt); if (likely(mnt_get_count(mnt))) return; br_write_lock(vfsmount_lock); +#endif if (unlikely(mnt->mnt_pinned)) { mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; @@ -789,12 +786,6 @@ put_again: br_write_unlock(vfsmount_lock); mntfree(mnt); } -#endif - -static void mntput_no_expire(struct vfsmount *mnt) -{ - __mntput(mnt, 0); -} void mntput(struct vfsmount *mnt) { @@ -802,7 +793,7 @@ void mntput(struct vfsmount *mnt) /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ if (unlikely(mnt->mnt_expiry_mark)) mnt->mnt_expiry_mark = 0; - __mntput(mnt, 0); + mntput_no_expire(mnt); } } EXPORT_SYMBOL(mntput); @@ -815,33 +806,6 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mntput_long(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - if (mnt) { - /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ - if (unlikely(mnt->mnt_expiry_mark)) - mnt->mnt_expiry_mark = 0; - __mntput(mnt, 1); - } -#else - mntput(mnt); -#endif -} -EXPORT_SYMBOL(mntput_long); - -struct vfsmount *mntget_long(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - if (mnt) - atomic_inc(&mnt->mnt_longrefs); - return mnt; -#else - return mntget(mnt); -#endif -} -EXPORT_SYMBOL(mntget_long); - void mnt_pin(struct vfsmount *mnt) { br_write_lock(vfsmount_lock); @@ -1216,7 +1180,7 @@ void release_mounts(struct list_head *head) dput(dentry); mntput(m); } - mntput_long(mnt); + mntput(mnt); } } @@ -1226,19 +1190,21 @@ void release_mounts(struct list_head *head) */ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { + LIST_HEAD(tmp_list); struct vfsmount *p; for (p = mnt; p; p = next_mnt(p, mnt)) - list_move(&p->mnt_hash, kill); + list_move(&p->mnt_hash, &tmp_list); if (propagate) - propagate_umount(kill); + propagate_umount(&tmp_list); - list_for_each_entry(p, kill, mnt_hash) { + list_for_each_entry(p, &tmp_list, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + __mnt_make_shortterm(p); list_del_init(&p->mnt_child); if (p->mnt_parent != p) { p->mnt_parent->mnt_ghosts++; @@ -1246,6 +1212,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) } change_mnt_propagation(p, MS_PRIVATE); } + list_splice(&tmp_list, kill); } static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); @@ -1844,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name) return err; down_write(&namespace_sem); - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto out; + err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; @@ -1904,6 +1872,8 @@ out: return err; } +static int do_add_mount(struct vfsmount *, struct path *, int); + /* * create a new mount for userspace and request it to be added into the * namespace's tree @@ -1912,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags, int mnt_flags, char *name, void *data) { struct vfsmount *mnt; + int err; if (!type) return -EINVAL; @@ -1924,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags, if (IS_ERR(mnt)) return 
PTR_ERR(mnt); - return do_add_mount(mnt, path, mnt_flags, NULL); + err = do_add_mount(mnt, path, mnt_flags); + if (err) + mntput(mnt); + return err; +} + +int finish_automount(struct vfsmount *m, struct path *path) +{ + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(m) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&m->mnt_expire)) { + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + list_del_init(&m->mnt_expire); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + } + mntput(m); + mntput(m); + return err; } /* * add a mount into a namespace's mount tree - * - provide the option of adding the new mount to an expiration list */ -int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist) +static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) { int err; @@ -1940,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, down_write(&namespace_sem); /* Something was mounted here while we slept */ - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto unlock; + err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; @@ -1958,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, goto unlock; newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, path))) - goto unlock; - - if (fslist) /* add to the specified expiration list */ - list_add_tail(&newmnt->mnt_expire, fslist); - - up_write(&namespace_sem); - return 0; + err = graft_tree(newmnt, path); unlock: up_write(&namespace_sem); - mntput_long(newmnt); return err; } -EXPORT_SYMBOL_GPL(do_add_mount); +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. + */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + list_add_tail(&mnt->mnt_expire, expiry_list); + + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); +} +EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any @@ -2262,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void) return new_ns; } +void mnt_make_longterm(struct vfsmount *mnt) +{ + __mnt_make_longterm(mnt); +} + +void mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) + return; + br_write_lock(vfsmount_lock); + atomic_dec(&mnt->mnt_longterm); + br_write_unlock(vfsmount_lock); +#endif +} + /* * Allocate a new namespace structure and populate it with contents * copied from the namespace of the passed in task structure. 
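The longterm marking above rides on the per-cpu mount refcount this series moves vfsmounts to (see the this_cpu_add() in alloc_vfsmnt earlier). For reference, the underlying count helper in fs/namespace.c has roughly this shape (simplified sketch):

static inline void mnt_add_count(struct vfsmount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

The fast path of mntput_no_expire() can then stay under the read-side lock as long as mnt_longterm is non-zero, since a longterm holder guarantees the mount cannot reach a zero count.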
@@ -2299,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, q = new_ns->root; while (p) { q->mnt_ns = new_ns; + __mnt_make_longterm(q); if (fs) { if (p == fs->root.mnt) { + fs->root.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); rootmnt = p; - fs->root.mnt = mntget_long(q); } if (p == fs->pwd.mnt) { + fs->pwd.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); pwdmnt = p; - fs->pwd.mnt = mntget_long(q); } } p = next_mnt(p, mnt_ns->root); @@ -2315,9 +2347,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, up_write(&namespace_sem); if (rootmnt) - mntput_long(rootmnt); + mntput(rootmnt); if (pwdmnt) - mntput_long(pwdmnt); + mntput(pwdmnt); return new_ns; } @@ -2350,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) new_ns = alloc_mnt_ns(); if (!IS_ERR(new_ns)) { mnt->mnt_ns = new_ns; + __mnt_make_longterm(mnt); new_ns->root = mnt; list_add(&new_ns->list, &new_ns->root->mnt_list); } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 28f136d4aae..f6946bb5cb5 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -21,9 +21,7 @@ #include <asm/uaccess.h> #include <asm/byteorder.h> -#include <linux/ncp_fs.h> - -#include "ncplib_kernel.h" +#include "ncp_fs.h" static void ncp_read_volume_list(struct file *, void *, filldir_t, struct ncp_cache_control *); @@ -82,7 +80,7 @@ static int ncp_compare_dentry(const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); -static const struct dentry_operations ncp_dentry_operations = +const struct dentry_operations ncp_dentry_operations = { .d_revalidate = ncp_lookup_validate, .d_hash = ncp_hash_dentry, @@ -90,14 +88,6 @@ static const struct dentry_operations ncp_dentry_operations = .d_delete = ncp_delete_dentry, }; -const struct dentry_operations ncp_root_dentry_operations = -{ - .d_hash = ncp_hash_dentry, - .d_compare = ncp_compare_dentry, - .d_delete = ncp_delete_dentry, -}; - - #define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) @@ -309,6 +299,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) int res, val = 0, len; __u8 __name[NCP_MAXPATHLEN + 1]; + if (dentry == dentry->d_sb->s_root) + return 1; + if (nd->flags & LOOKUP_RCU) return -ECHILD; @@ -637,7 +630,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, entry->ino = iunique(dir->i_sb, 2); inode = ncp_iget(dir->i_sb, entry); if (inode) { - d_set_d_op(newdent, &ncp_dentry_operations); d_instantiate(newdent, inode); if (!hashed) d_rehash(newdent); @@ -893,7 +885,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc if (inode) { ncp_new_dentry(dentry); add_entry: - d_set_d_op(dentry, &ncp_dentry_operations); d_add(dentry, inode); error = 0; } diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index cb50aaf981d..0ed65e0c3df 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,8 +18,7 @@ #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/ncp_fs.h> -#include "ncplib_kernel.h" +#include "ncp_fs.h" static int ncp_fsync(struct file *file, int datasync) { diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 9b39a5dd413..00a1d1c3d3a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -31,11 +31,9 @@ #include <linux/seq_file.h> #include <linux/namei.h> -#include <linux/ncp_fs.h> - #include <net/sock.h> -#include 
"ncplib_kernel.h" +#include "ncp_fs.h" #include "getopt.h" #define NCP_DEFAULT_FILE_MODE 0600 @@ -544,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_blocksize_bits = 10; sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; + sb->s_d_op = &ncp_dentry_operations; sb->s_bdi = &server->bdi; server = NCP_SBP(sb); @@ -723,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) goto out_no_root; - d_set_d_op(sb->s_root, &ncp_root_dentry_operations); return 0; out_no_root: diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index d40a547e337..790e92a9ec6 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,11 +20,9 @@ #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/ncp_fs.h> - #include <asm/uaccess.h> -#include "ncplib_kernel.h" +#include "ncp_fs.h" /* maximum limit for ncp_objectname_ioctl */ #define NCP_OBJECT_NAME_MAX_LEN 4096 diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 56f5b3a0e1e..a7c07b44b10 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -16,12 +16,12 @@ #include <linux/mman.h> #include <linux/string.h> #include <linux/fcntl.h> -#include <linux/ncp_fs.h> -#include "ncplib_kernel.h" #include <asm/uaccess.h> #include <asm/system.h> +#include "ncp_fs.h" + /* * Fill in the supplied page for mmap * XXX: how are we excluding truncate/invalidate here? Maybe need to lock diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h new file mode 100644 index 00000000000..31831afe1c3 --- /dev/null +++ b/fs/ncpfs/ncp_fs.h @@ -0,0 +1,98 @@ +#include <linux/ncp_fs.h> +#include "ncp_fs_i.h" +#include "ncp_fs_sb.h" + +/* define because it is easy to change PRINTK to {*}PRINTK */ +#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) + +#undef NCPFS_PARANOIA +#ifdef NCPFS_PARANOIA +#define PPRINTK(format, args...) PRINTK(format , ## args) +#else +#define PPRINTK(format, args...) +#endif + +#ifndef DEBUG_NCP +#define DEBUG_NCP 0 +#endif +#if DEBUG_NCP > 0 +#define DPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DPRINTK(format, args...) +#endif +#if DEBUG_NCP > 1 +#define DDPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DDPRINTK(format, args...) 
+#endif + +#define NCP_MAX_RPC_TIMEOUT (6*HZ) + + +struct ncp_entry_info { + struct nw_info_struct i; + ino_t ino; + int opened; + int access; + unsigned int volume; + __u8 file_handle[6]; +}; + +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) +{ + return container_of(inode, struct ncp_inode_info, vfs_inode); +} + +/* linux/fs/ncpfs/inode.c */ +int ncp_notify_change(struct dentry *, struct iattr *); +struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); +void ncp_update_inode(struct inode *, struct ncp_entry_info *); +void ncp_update_inode2(struct inode *, struct ncp_entry_info *); + +/* linux/fs/ncpfs/dir.c */ +extern const struct inode_operations ncp_dir_inode_operations; +extern const struct file_operations ncp_dir_operations; +extern const struct dentry_operations ncp_dentry_operations; +int ncp_conn_logged_in(struct super_block *); +int ncp_date_dos2unix(__le16 time, __le16 date); +void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); + +/* linux/fs/ncpfs/ioctl.c */ +long ncp_ioctl(struct file *, unsigned int, unsigned long); +long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* linux/fs/ncpfs/sock.c */ +int ncp_request2(struct ncp_server *server, int function, + void* reply, int max_reply_size); +static inline int ncp_request(struct ncp_server *server, int function) { + return ncp_request2(server, function, server->packet, server->packet_size); +} +int ncp_connect(struct ncp_server *server); +int ncp_disconnect(struct ncp_server *server); +void ncp_lock_server(struct ncp_server *server); +void ncp_unlock_server(struct ncp_server *server); + +/* linux/fs/ncpfs/symlink.c */ +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern const struct address_space_operations ncp_symlink_aops; +int ncp_symlink(struct inode*, struct dentry*, const char*); +#endif + +/* linux/fs/ncpfs/file.c */ +extern const struct inode_operations ncp_file_inode_operations; +extern const struct file_operations ncp_file_operations; +int ncp_make_open(struct inode *, int); + +/* linux/fs/ncpfs/mmap.c */ +int ncp_mmap(struct file *, struct vm_area_struct *); + +/* linux/fs/ncpfs/ncplib_kernel.c */ +int ncp_make_closed(struct inode *); + +#include "ncplib_kernel.h" diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h new file mode 100644 index 00000000000..4b0bec47784 --- /dev/null +++ b/fs/ncpfs/ncp_fs_i.h @@ -0,0 +1,29 @@ +/* + * ncp_fs_i.h + * + * Copyright (C) 1995 Volker Lendecke + * + */ + +#ifndef _LINUX_NCP_FS_I +#define _LINUX_NCP_FS_I + +/* + * This is the ncpfs part of the inode structure. This must contain + * all the information we need to work with an inode after creation. 
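NCP_FINFO() above is the usual embedded-inode idiom: the VFS struct inode lives inside struct ncp_inode_info, so container_of() recovers the private part from the public one. The allocation side pairs with it, as in ncpfs's own ncp_alloc_inode (sketch; the kmem_cache is assumed to be set up at module init):

static struct kmem_cache *ncp_inode_cachep;	/* assumed, init_ncp_fs() */

static struct inode *ncp_alloc_inode(struct super_block *sb)
{
	struct ncp_inode_info *ei;

	ei = kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;	/* the VFS only sees the embedded inode */
}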
+ */ +struct ncp_inode_info { + __le32 dirEntNum; + __le32 DosDirNum; + __u8 volNumber; + __le32 nwattr; + struct mutex open_mutex; + atomic_t opened; + int access; + int flags; +#define NCPI_KLUDGE_SYMLINK 0x0001 + __u8 file_handle[6]; + struct inode vfs_inode; +}; + +#endif /* _LINUX_NCP_FS_I */ diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h new file mode 100644 index 00000000000..4af803f1351 --- /dev/null +++ b/fs/ncpfs/ncp_fs_sb.h @@ -0,0 +1,176 @@ +/* + * ncp_fs_sb.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * + */ + +#ifndef _NCP_FS_SB +#define _NCP_FS_SB + +#include <linux/types.h> +#include <linux/ncp_mount.h> +#include <linux/net.h> +#include <linux/mutex.h> +#include <linux/backing-dev.h> +#include <linux/workqueue.h> + +#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ + +struct sock; + +struct ncp_mount_data_kernel { + unsigned long flags; /* NCP_MOUNT_* flags */ + unsigned int int_flags; /* internal flags */ +#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 + __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */ + struct pid *wdog_pid; /* Who cares for our watchdog packets? */ + unsigned int ncp_fd; /* The socket to the ncp port */ + unsigned int time_out; /* How long should I wait after + sending a NCP request? */ + unsigned int retry_count; /* And how often should I retry? */ + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_mode_t file_mode; + __kernel_mode_t dir_mode; + int info_fd; +}; + +struct ncp_server { + + struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of + interest for us later, so we store + it completely. */ + + __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; + + struct file *ncp_filp; /* File pointer to ncp socket */ + struct socket *ncp_sock;/* ncp socket */ + struct file *info_filp; + struct socket *info_sock; + + u8 sequence; + u8 task; + u16 connection; /* Remote connection number */ + + u8 completion; /* Status message from server */ + u8 conn_status; /* Bit 4 = 1 ==> Server going down, no + requests allowed anymore. + Bit 0 = 1 ==> Server is down. */ + + int buffer_size; /* Negotiated bufsize */ + + int reply_size; /* Size of last reply */ + + int packet_size; + unsigned char *packet; /* Here we prepare requests and + receive replies */ + unsigned char *txbuf; /* Storage for current request */ + unsigned char *rxbuf; /* Storage for reply to current request */ + + int lock; /* To prevent mismatch in protocols. */ + struct mutex mutex; + + int current_size; /* for packet preparation */ + int has_subfunction; + int ncp_reply_size; + + int root_setuped; + struct mutex root_setup_lock; + + /* info for packet signing */ + int sign_wanted; /* 1=Server needs signed packets */ + int sign_active; /* 0=don't do signing, 1=do */ + char sign_root[8]; /* generated from password and encr. 
key */ + char sign_last[16]; + + /* Authentication info: NDS or BINDERY, username */ + struct { + int auth_type; + size_t object_name_len; + void* object_name; + int object_type; + } auth; + /* Password info */ + struct { + size_t len; + void* data; + } priv; + struct rw_semaphore auth_rwsem; + + /* nls info: codepage for volume and charset for I/O */ + struct nls_table *nls_vol; + struct nls_table *nls_io; + + /* maximum age in jiffies */ + atomic_t dentry_ttl; + + /* miscellaneous */ + unsigned int flags; + + spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ + + void (*data_ready)(struct sock* sk, int len); + void (*error_report)(struct sock* sk); + void (*write_space)(struct sock* sk); /* STREAM mode only */ + struct { + struct work_struct tq; /* STREAM/DGRAM: data/error ready */ + struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ + struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ + + unsigned int state; /* STREAM only: receiver state */ + struct { + __u32 magic __packed; + __u32 len __packed; + __u16 type __packed; + __u16 p1 __packed; + __u16 p2 __packed; + __u16 p3 __packed; + __u16 type2 __packed; + } buf; /* STREAM only: temporary buffer */ + unsigned char* ptr; /* STREAM only: pointer to data */ + size_t len; /* STREAM only: length of data to receive */ + } rcv; + struct { + struct list_head requests; /* STREAM only: queued requests */ + struct work_struct tq; /* STREAM only: transmitter ready */ + struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ + } tx; + struct timer_list timeout_tm; /* DGRAM only: timeout timer */ + struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ + int timeout_last; /* DGRAM only: current timeout length */ + int timeout_retries; /* DGRAM only: retries left */ + struct { + size_t len; + __u8 data[128]; + } unexpected_packet; + struct backing_dev_info bdi; +}; + +extern void ncp_tcp_rcv_proc(struct work_struct *work); +extern void ncp_tcp_tx_proc(struct work_struct *work); +extern void ncpdgram_rcv_proc(struct work_struct *work); +extern void ncpdgram_timeout_proc(struct work_struct *work); +extern void ncpdgram_timeout_call(unsigned long server); +extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_write_space(struct sock* sk); +extern void ncp_tcp_error_report(struct sock* sk); + +#define NCP_FLAG_UTF8 1 + +#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) +#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) +#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) + +static inline int ncp_conn_valid(struct ncp_server *server) +{ + return ((server->conn_status & 0x11) == 0); +} + +static inline void ncp_invalidate_conn(struct ncp_server *server) +{ + server->conn_status |= 0x01; +} + +#endif diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index a95615a0b6a..981a95617fc 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -11,7 +11,7 @@ -#include "ncplib_kernel.h" +#include "ncp_fs.h" static inline void assert_server_locked(struct ncp_server *server) { diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 1220df75ff2..09881e6aa5a 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -32,8 +32,6 @@ #include <linux/ctype.h> #endif /* CONFIG_NCPFS_NLS */ -#include <linux/ncp_fs.h> - #define NCP_MIN_SYMLINK_SIZE 8 #define NCP_MAX_SYMLINK_SIZE 512 diff 
--git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c index d8b2d7e6910..08907599dcd 100644 --- a/fs/ncpfs/ncpsign_kernel.c +++ b/fs/ncpfs/ncpsign_kernel.c @@ -11,6 +11,7 @@ #include <linux/string.h> #include <linux/ncp.h> #include <linux/bitops.h> +#include "ncp_fs.h" #include "ncpsign_kernel.h" /* i386: 32-bit, little endian, handles mis-alignment */ diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h index 6451a68381c..d9a1438bb1f 100644 --- a/fs/ncpfs/ncpsign_kernel.h +++ b/fs/ncpfs/ncpsign_kernel.h @@ -8,8 +8,6 @@ #ifndef _NCPSIGN_KERNEL_H #define _NCPSIGN_KERNEL_H -#include <linux/ncp_fs.h> - #ifdef CONFIG_NCPFS_PACKET_SIGNING void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 668bd267346..3a1587222c8 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -28,7 +28,7 @@ #include <linux/poll.h> #include <linux/file.h> -#include <linux/ncp_fs.h> +#include "ncp_fs.h" #include "ncpsign_kernel.h" diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index c634fd17b33..661f861d80c 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -25,13 +25,11 @@ #include <linux/errno.h> #include <linux/fs.h> -#include <linux/ncp_fs.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/stat.h> -#include "ncplib_kernel.h" - +#include "ncp_fs.h" /* these magic numbers must appear in the symlink file -- this makes it a bit more resilient against the magic attributes being set on random files. */ diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 93a8b3bd69e..199016528fc 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -16,9 +16,7 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/sunrpc/svcauth_gss.h> -#if defined(CONFIG_NFS_V4_1) #include <linux/sunrpc/bc_xprt.h> -#endif #include <net/inet_sock.h> @@ -137,6 +135,33 @@ out_err: #if defined(CONFIG_NFS_V4_1) /* + * CB_SEQUENCE operations will fail until the callback sessionid is set. + */ +int nfs4_set_callback_sessionid(struct nfs_client *clp) +{ + struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv; + struct nfs4_sessionid *bc_sid; + + if (!serv->sv_bc_xprt) + return -EINVAL; + + /* on success freed in xprt_free */ + bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL); + if (!bc_sid) + return -ENOMEM; + memcpy(bc_sid->data, &clp->cl_session->sess_id.data, + NFS4_MAX_SESSIONID_LEN); + spin_lock_bh(&serv->sv_cb_lock); + serv->sv_bc_xprt->xpt_bc_sid = bc_sid; + spin_unlock_bh(&serv->sv_cb_lock); + dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__, + ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1], + ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3], + serv->sv_bc_xprt); + return 0; +} + +/* * The callback service for NFSv4.1 callbacks */ static int @@ -177,30 +202,38 @@ nfs41_callback_svc(void *vrqstp) struct svc_rqst * nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) { - struct svc_xprt *bc_xprt; - struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); + struct svc_rqst *rqstp; + int ret; - dprintk("--> %s\n", __func__); - /* Create a svc_sock for the service */ - bc_xprt = svc_sock_create(serv, xprt->prot); - if (!bc_xprt) + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. 
+ * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); + if (ret < 0) { + rqstp = ERR_PTR(ret); goto out; + } /* * Save the svc_serv in the transport so that it can * be referenced when the session backchannel is initialized */ - serv->bc_xprt = bc_xprt; xprt->bc_serv = serv; INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) - svc_sock_destroy(bc_xprt); + if (IS_ERR(rqstp)) { + svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; + } out: - dprintk("--> %s return %p\n", __func__, rqstp); + dprintk("--> %s return %ld\n", __func__, + IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); return rqstp; } @@ -233,6 +266,10 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct nfs_callback_data *cb_info) { } +int nfs4_set_callback_sessionid(struct nfs_client *clp) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -328,6 +365,9 @@ static int check_gss_callback_principal(struct nfs_client *clp, struct rpc_clnt *r = clp->cl_rpcclient; char *p = svc_gss_principal(rqstp); + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ + if (clp->cl_minorversion != 0) + return SVC_DROP; /* * It might just be a normal user principal, in which case * userspace won't bother to tell us the name at all. @@ -345,6 +385,23 @@ static int check_gss_callback_principal(struct nfs_client *clp, return SVC_OK; } +/* pg_authenticate method helper */ +static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp) +{ + struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp); + int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0; + + dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc); + if (svc_is_backchannel(rqstp)) + /* Sessionid (usually) set after CB_NULL ping */ + return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid, + is_cb_compound); + else + /* No callback identifier in pg_authenticate */ + return nfs4_find_client_no_ident(svc_addr(rqstp)); +} + +/* pg_authenticate method for nfsv4 callback threads. 
*/ static int nfs_callback_authenticate(struct svc_rqst *rqstp) { struct nfs_client *clp; @@ -352,7 +409,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) int ret = SVC_OK; /* Don't talk to strangers */ - clp = nfs_find_client(svc_addr(rqstp), 4); + clp = nfs_cb_find_client(rqstp); if (clp == NULL) return SVC_DROP; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 85a7cfd1b8d..d3b44f9bd74 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -34,10 +34,17 @@ enum nfs4_callback_opnum { OP_CB_ILLEGAL = 10044, }; +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */ +}; + struct cb_compound_hdr_arg { unsigned int taglen; const char *tag; unsigned int minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ unsigned nops; }; @@ -103,14 +110,23 @@ struct cb_sequenceres { uint32_t csr_target_highestslotid; }; -extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, - struct cb_sequenceres *res); +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid); #define RCA4_TYPE_MASK_RDATA_DLG 0 #define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f struct cb_recallanyargs { struct sockaddr *craa_addr; @@ -118,25 +134,52 @@ struct cb_recallanyargs { uint32_t craa_type_mask; }; -extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); struct cb_recallslotargs { struct sockaddr *crsa_addr; uint32_t crsa_target_max_slots; }; -extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, - void *dummy); +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t cbl_layoutchanged; + union { + struct { + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_range; + nfs4_stateid cbl_stateid; + }; + struct nfs_fsid cbl_fsid; + }; +}; -#endif /* CONFIG_NFS_V4_1 */ +extern unsigned nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); -extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); -extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); +extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); +extern void nfs4_cb_take_slot(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4_1 */ +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); #ifdef CONFIG_NFS_V4 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion); extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const 
nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp); #endif /* CONFIG_NFS_V4 */ /* * nfs41: Callbacks are expected to not cause substantial latency, diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2950fca0c61..4bb91cb2620 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -12,30 +12,33 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" #ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK #endif - -__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) { - struct nfs_client *clp; struct nfs_delegation *delegation; struct nfs_inode *nfsi; struct inode *inode; + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + res->bitmap[0] = res->bitmap[1] = 0; res->status = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) - goto out; dprintk("NFS: GETATTR callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - inode = nfs_delegation_find_inode(clp, &args->fh); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); if (inode == NULL) - goto out_putclient; + goto out; nfsi = NFS_I(inode); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); @@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * out_iput: rcu_read_unlock(); iput(inode); -out_putclient: - nfs_put_client(clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; } -__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; struct inode *inode; __be32 res; - res = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. 
Set in cb_sequence for v4.1 */ goto out; dprintk("NFS: RECALL callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - - do { - struct nfs_client *prev = clp; - - inode = nfs_delegation_find_inode(clp, &args->fh); - if (inode != NULL) { - /* Set up a helper thread to actually return the delegation */ - switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { - case 0: - res = 0; - break; - case -ENOENT: - if (res != 0) - res = htonl(NFS4ERR_BAD_STATEID); - break; - default: - res = htonl(NFS4ERR_RESOURCE); - } - iput(inode); - } - clp = nfs_find_client_next(prev); - nfs_put_client(prev); - } while (clp != NULL); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + res = htonl(NFS4ERR_BADHANDLE); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + if (res != 0) + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); return res; @@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf #if defined(CONFIG_NFS_V4_1) +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + bool found = false; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. 
+ */ + get_layout_hdr(lo); + break; + } + spin_unlock(&clp->cl_lock); + if (!found) + return NFS4ERR_NOMATCHING_LAYOUT; + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + mark_matching_lsegs_invalid(lo, &free_me_list, + args->cbl_range.iomode)) + rv = NFS4ERR_DELAY; + else + rv = NFS4ERR_NOMATCHING_LAYOUT; + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + struct pnfs_layout_hdr *tmp; + LIST_HEAD(recall_list); + LIST_HEAD(free_me_list); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if ((args->cbl_recall_type == RETURN_FSID) && + memcmp(&NFS_SERVER(lo->plh_inode)->fsid, + &args->cbl_fsid, sizeof(struct nfs_fsid))) + continue; + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } + spin_unlock(&clp->cl_lock); + list_for_each_entry_safe(lo, tmp, + &recall_list, plh_bulk_recall) { + ino = lo->plh_inode; + spin_lock(&ino->i_lock); + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) + rv = NFS4ERR_DELAY; + list_del_init(&lo->plh_bulk_recall); + spin_unlock(&ino->i_lock); + put_layout_hdr(lo); + iput(ino); + } + pnfs_free_lseg_list(&free_me_list); + return rv; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + u32 res = NFS4ERR_DELAY; + + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); + if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) + goto out; + if (args->cbl_recall_type == RETURN_FILE) + res = initiate_file_draining(clp, args); + else + res = initiate_bulk_draining(clp, args); + clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); +out: + dprintk("%s returning %i\n", __func__, res); + return res; + +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps) +{ + u32 res; + + dprintk("%s: -->\n", __func__); + + if (cps->clp) + res = do_callback_layoutrecall(cps->clp, args); + else + res = NFS4ERR_OP_NOT_IN_SESSION; + + dprintk("%s: exit with status = %d\n", __func__, res); + return cpu_to_be32(res); +} + +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs args; + + /* Pretend we got a CB_LAYOUTRECALL(ALL) */ + memset(&args, 0, sizeof(args)); + args.cbl_recall_type = RETURN_ALL; + /* FIXME we ignore errors, what should we do? */ + do_callback_layoutrecall(clp, &args); +} + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) { if (delegation == NULL) @@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) } /* - * Returns a pointer to a held 'struct nfs_client' that matches the server's - * address, major version number, and session ID. It is the caller's - * responsibility to release the returned reference. - * - * Returns NULL if there are no connections with sessions, or if no session - * matches the one of interest. 
- */ - static struct nfs_client *find_client_with_session( - const struct sockaddr *addr, u32 nfsversion, - struct nfs4_sessionid *sessionid) -{ - struct nfs_client *clp; - - clp = nfs_find_client(addr, 4); - if (clp == NULL) - return NULL; - - do { - struct nfs_client *prev = clp; - - if (clp->cl_session != NULL) { - if (memcmp(clp->cl_session->sess_id.data, - sessionid->data, - NFS4_MAX_SESSIONID_LEN) == 0) { - /* Returns a held reference to clp */ - return clp; - } - } - clp = nfs_find_client_next(prev); - nfs_put_client(prev); - } while (clp != NULL); - - return NULL; -} - -/* * For each referring call triple, check the session's slot table for * a match. If the slot is in use and the sequence numbers match, the * client is still waiting for a response to the original request. @@ -276,20 +368,34 @@ out: } __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, - struct cb_sequenceres *res) + struct cb_sequenceres *res, + struct cb_process_state *cps) { struct nfs_client *clp; int i; __be32 status; + cps->clp = NULL; + status = htonl(NFS4ERR_BADSESSION); - clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); + /* Incoming session must match the callback session */ + if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN)) + goto out; + + clp = nfs4_find_client_sessionid(args->csa_addr, + &args->csa_sessionid, 1); if (clp == NULL) goto out; + /* state manager is resetting the session */ + if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + status = validate_seqid(&clp->cl_session->bc_slot_table, args); if (status) - goto out_putclient; + goto out; /* * Check for pending referring calls. If a match is found, a @@ -298,7 +404,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, */ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { status = htonl(NFS4ERR_DELAY); - goto out_putclient; + goto out; } memcpy(&res->csr_sessionid, &args->csa_sessionid, @@ -307,83 +413,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + nfs4_cb_take_slot(clp); + cps->clp = clp; /* put in nfs4_callback_compound */ -out_putclient: - nfs_put_client(clp); out: for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) - res->csr_status = 0; - else + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else res->csr_status = status; + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, ntohl(status), ntohl(res->csr_status)); return status; } -__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +static bool +validate_bitmap_values(unsigned long mask) +{ + return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} +
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; __be32 status; fmode_t flags = 0; - status = htonl(NFS4ERR_OP_NOT_IN_SESSION); - clp = nfs_find_client(args->craa_addr, 4); - if (clp == NULL) + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ goto out; dprintk("NFS: RECALL_ANY callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rpc_peeraddr2str(cps->clp->cl_rpcclient,
RPC_DISPLAY_ADDR)); + + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values(args->craa_type_mask)) + goto out; + status = cpu_to_be32(NFS4_OK); if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) &args->craa_type_mask)) flags = FMODE_READ; if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) &args->craa_type_mask)) flags |= FMODE_WRITE; - + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + pnfs_recall_all_layouts(cps->clp); if (flags) - nfs_expire_all_delegation_types(clp, flags); - status = htonl(NFS4_OK); + nfs_expire_all_delegation_types(cps->clp, flags); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } /* Reduce the fore channel's max_slots to the target value */ -__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; struct nfs4_slot_table *fc_tbl; __be32 status; status = htonl(NFS4ERR_OP_NOT_IN_SESSION); - clp = nfs_find_client(args->crsa_addr, 4); - if (clp == NULL) + if (!cps->clp) /* set in cb_sequence */ goto out; dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), args->crsa_target_max_slots); - fc_tbl = &clp->cl_session->fc_slot_table; + fc_tbl = &cps->clp->cl_session->fc_slot_table; status = htonl(NFS4ERR_BAD_HIGH_SLOT); if (args->crsa_target_max_slots > fc_tbl->max_slots || args->crsa_target_max_slots < 1) - goto out_putclient; + goto out; status = htonl(NFS4_OK); if (args->crsa_target_max_slots == fc_tbl->max_slots) - goto out_putclient; + goto out; fc_tbl->target_max_slots = args->crsa_target_max_slots; - nfs41_handle_recall_slot(clp); -out_putclient: - nfs_put_client(clp); /* balance nfs_find_client */ + nfs41_handle_recall_slot(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 05af212f0ed..23112c263f8 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -10,8 +10,10 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/slab.h> +#include <linux/sunrpc/bc_xprt.h> #include "nfs4_fs.h" #include "callback.h" +#include "internal.h" #define CB_OP_TAGLEN_MAXSZ (512) #define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) @@ -22,6 +24,7 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #if defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -33,7 +36,8 @@ /* Internal error code */ #define NFS4ERR_RESOURCE_HDR 11050 -typedef __be32 (*callback_process_op_t)(void *, void *); +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); @@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound hdr->minorversion = ntohl(*p++); /* Check minor version is zero or one. 
*/ if (hdr->minorversion <= 1) { - p++; /* skip callback_ident */ + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ } else { printk(KERN_WARNING "%s: NFSv4 server callback with " "illegal minor version %u!\n", @@ -220,6 +224,66 @@ out: #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + uint32_t iomode; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + /* Despite the spec's xdr, iomode really belongs in the FILE switch, + * as it is unusable and ignored with the other types. + */ + iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (args->cbl_recall_type == RETURN_FILE) { + args->cbl_range.iomode = iomode; + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_range.offset); + p = xdr_decode_hyper(p, &args->cbl_range.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } else if (args->cbl_recall_type != RETURN_ALL) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", + __func__, + args->cbl_layout_type, iomode, + args->cbl_layoutchanged, args->cbl_recall_type); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + static __be32 decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) { @@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_SEQUENCE: case OP_CB_RECALL_ANY: case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: *op = &callback_ops[op_nr]; break; - case OP_CB_LAYOUTRECALL: case OP_CB_NOTIFY_DEVICEID: case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: @@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS_OK); } +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing is done.
+ * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid--; + nfs4_check_drain_bc_complete(session); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct nfs_client *clp) +{ + if (clp && clp->cl_session) + nfs4_callback_free_slot(clp->cl_session); +} + +/* A single slot, so highest used slotid is either 0 or -1 */ +void nfs4_cb_take_slot(struct nfs_client *clp) +{ + struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + tbl->highest_used_slotid++; + BUG_ON(tbl->highest_used_slotid != 0); + spin_unlock(&tbl->slot_tbl_lock); +} + #else /* CONFIG_NFS_V4_1 */ static __be32 @@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } +static void nfs4_cb_free_slot(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_V4_1 */ static __be32 @@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) static __be32 process_op(uint32_t minorversion, int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp, int* drc_status) + struct xdr_stream *xdr_out, void *resp, + struct cb_process_state *cps) { struct callback_op *op = &callback_ops[0]; unsigned int op_nr; @@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop, if (status) goto encode_hdr; - if (*drc_status) { - status = *drc_status; + if (cps->drc_status) { + status = cps->drc_status; goto encode_hdr; } @@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop, if (maxlen > 0 && maxlen < PAGE_SIZE) { status = op->decode_args(rqstp, xdr_in, argp); if (likely(status == 0)) - status = op->process_op(argp, resp); + status = op->process_op(argp, resp, cps); } else status = htonl(NFS4ERR_RESOURCE); - /* Only set by OP_CB_SEQUENCE processing */ - if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { - *drc_status = status; - status = 0; - } - encode_hdr: res = encode_op_hdr(xdr_out, op_nr, status); if (unlikely(res)) @@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_compound_hdr_arg hdr_arg = { 0 }; struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; - __be32 *p; - __be32 status, drc_status = 0; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + }; unsigned int nops = 0; dprintk("%s: start\n", __func__); @@ -696,6 +792,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (status == __constant_htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); + if (!cps.clp) + return rpc_drop_reply; + } else + cps.svc_sid = bc_xprt_sid(rqstp); + hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) @@ -703,7 +806,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r while (status == 0 && nops != hdr_arg.nops) { status = process_op(hdr_arg.minorversion, nops, rqstp, - &xdr_in, argp, &xdr_out, resp, &drc_status); + &xdr_in, argp, &xdr_out, resp, &cps); nops++; } @@ -716,6 +819,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); + nfs4_cb_free_slot(cps.clp); + nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 
return rpc_success; } @@ -739,6 +844,12 @@ static struct callback_op callback_ops[] = { .res_maxsize = CB_OP_RECALL_RES_MAXSZ, }, #if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, [OP_CB_SEQUENCE] = { .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0870d0d4efc..192f2f86026 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock); static LIST_HEAD(nfs_client_list); static LIST_HEAD(nfs_volume_list); static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +#ifdef CONFIG_NFS_V4 +static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; +retry: + if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) + return -ENOMEM; + spin_lock(&nfs_client_lock); + ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nfs_client_lock); + if (ret == -EAGAIN) + goto retry; + return ret; +} +#endif /* CONFIG_NFS_V4 */ /* * RPC cruft for NFS @@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_proto = cl_init->proto; #ifdef CONFIG_NFS_V4 - INIT_LIST_HEAD(&clp->cl_delegations); + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error_cleanup; + spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); @@ -170,21 +197,17 @@ error_0: } #ifdef CONFIG_NFS_V4 -/* - * Clears/puts all minor version specific parts from an nfs_client struct - * reverting it to minorversion 0. 
- */ -static void nfs4_clear_client_minor_version(struct nfs_client *clp) -{ #ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(clp)) { +static void nfs4_shutdown_session(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) nfs4_destroy_session(clp->cl_session); - clp->cl_session = NULL; - } - - clp->cl_mvops = nfs_v4_minor_ops[0]; -#endif /* CONFIG_NFS_V4_1 */ } +#else /* CONFIG_NFS_V4_1 */ +static void nfs4_shutdown_session(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4_1 */ /* * Destroy the NFS4 callback service @@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) nfs4_kill_renewd(clp); - nfs4_clear_client_minor_version(clp); + nfs4_shutdown_session(clp); nfs4_destroy_callback(clp); if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) nfs_idmap_delete(clp); rpc_destroy_wait_queue(&clp->cl_rpcwaitq); } + +/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ +void nfs_cleanup_cb_ident_idr(void) +{ + idr_destroy(&cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + if (clp->cl_cb_ident) + idr_remove(&cb_ident_idr, clp->cl_cb_ident); +} + +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + #else static void nfs4_shutdown_client(struct nfs_client *clp) { } + +void nfs_cleanup_cb_ident_idr(void) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} + +static void pnfs_init_server(struct nfs_server *server) +{ +} + #endif /* CONFIG_NFS_V4 */ /* @@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp) if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { list_del(&clp->cl_share_link); + nfs_cb_idr_remove_locked(clp); spin_unlock(&nfs_client_lock); BUG_ON(!list_empty(&clp->cl_superblocks)); @@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, return 0; } -/* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) -{ - struct nfs_client *clp; - - spin_lock(&nfs_client_lock); - list_for_each_entry(clp, &nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - - /* Don't match clients that failed to initialise properly */ - if (!(clp->cl_cons_state == NFS_CS_READY || - clp->cl_cons_state == NFS_CS_SESSION_INITING)) - continue; - - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsversion) - continue; - - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - continue; - - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; -} - -/* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client_next(struct nfs_client *clp) +/* Common match routine for v4.0 and v4.1 callback services */ +bool +nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, + u32 minorversion) { - struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; - u32 nfsvers = clp->rpc_ops->version; + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - spin_lock(&nfs_client_lock); - list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr 
*)&clp->cl_addr; + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; - /* Don't match clients that failed to initialise properly */ - if (clp->cl_cons_state != NFS_CS_READY) - continue; + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsvers) - continue; + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(sap, clap)) - continue; - - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; + return true; } /* @@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve target->options = source->options; } +static void nfs_server_insert_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + +} + +static void nfs_server_remove_lists(struct nfs_server *server) +{ + spin_lock(&nfs_client_lock); + list_del_rcu(&server->client_link); + list_del(&server->master_link); + spin_unlock(&nfs_client_lock); + + synchronize_rcu(); +} + /* * Allocate and initialise a server record */ @@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void) /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); atomic_set(&server->active, 0); @@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void) return NULL; } + pnfs_init_server(server); + return server; } @@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server) { dprintk("--> nfs_free_server()\n"); + nfs_server_remove_lists(server); unset_pnfs_layoutdriver(server); - spin_lock(&nfs_client_lock); - list_del(&server->client_link); - list_del(&server->master_link); - spin_unlock(&nfs_client_lock); if (server->destroy != NULL) server->destroy(server); @@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr); return server; @@ -1125,6 +1156,101 @@ error: #ifdef CONFIG_NFS_V4 /* + * NFSv4.0 callback thread helper + * + * Find a client by IP address, protocol version, and minorversion + * + * Called from the pg_authenticate method. The callback identifier + * is not used as it has not been decoded. 
+ * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_no_ident(const struct sockaddr *addr) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 0) == false) + continue; + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(int cb_ident) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + clp = idr_find(&cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service + * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL + * can arrive before the callback sessionid is set. For CB_NULL calls, + * find a client by IP address protocol version, and minorversion. + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid, int is_cb_compound) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 1) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid unless cb_null call */ + if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0)) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid, int is_cb_compound) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* * Initialize the NFS4 callback service */ static int nfs4_init_callback(struct nfs_client *clp) @@ -1342,11 +1468,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; out: nfs_free_fattr(fattr); @@ -1551,11 +1673,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (error < 0) goto out_free_server; - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr_fsinfo); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 1fd62fc49be..364e4328f39 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -40,11 +40,23 @@ static void nfs_free_delegation(struct nfs_delegation *delegation) call_rcu(&delegation->rcu, nfs_free_delegation_callback); } +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation:
delegation to process + * + */ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) { set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. + */ int nfs_have_delegation(struct inode *inode, fmode_t flags) { struct nfs_delegation *delegation; @@ -119,10 +131,15 @@ again: return 0; } -/* - * Set up a delegation on an inode +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * */ -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) { struct nfs_delegation *delegation; struct rpc_cred *oldcred = NULL; @@ -175,38 +192,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation return inode; } -static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, - const nfs4_stateid *stateid, - struct nfs_client *clp) +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, + struct nfs_server *server) { struct nfs_delegation *delegation = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&clp->cl_lock)); + lockdep_is_held(&server->nfs_client->cl_lock)); if (delegation == NULL) goto nomatch; + spin_lock(&delegation->lock); - if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, - sizeof(delegation->stateid.data)) != 0) - goto nomatch_unlock; list_del_rcu(&delegation->super_list); delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; -nomatch_unlock: - spin_unlock(&delegation->lock); nomatch: return NULL; } -/* - * Set up a delegation on an inode +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, server); + spin_unlock(&clp->cl_lock); + return delegation; +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a negative errno value. 
*/ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation, *old_delegation; struct nfs_delegation *freeme = NULL; @@ -227,7 +258,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct spin_lock(&clp->cl_lock); old_delegation = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&clp->cl_lock)); + lockdep_is_held(&clp->cl_lock)); if (old_delegation != NULL) { if (memcmp(&delegation->stateid, &old_delegation->stateid, sizeof(old_delegation->stateid)) == 0 && @@ -246,9 +277,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); + freeme = nfs_detach_delegation_locked(nfsi, server); } - list_add_rcu(&delegation->super_list, &clp->cl_delegations); + list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -290,73 +321,85 @@ out: return err; } -/* - * Return all delegations that have been marked for return +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Returns zero on success, or a negative errno value. */ int nfs_client_return_marked_delegations(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_server *server; struct inode *inode; int err = 0; restart: rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) { - filemap_flush(inode->i_mapping); - err = __nfs_inode_return_delegation(inode, delegation, 0); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!test_and_clear_bit(NFS_DELEGATION_RETURN, + &delegation->flags)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, + delegation, 0); + } + iput(inode); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; } - iput(inode); - if (!err) - goto restart; - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); - return err; } rcu_read_unlock(); return 0; } -/* - * This function returns the delegation without reclaiming opens - * or protecting against delegation reclaims. - * It is therefore really only safe to be called from - * nfs4_clear_inode() +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). 
*/ void nfs_inode_return_delegation_noreclaim(struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; if (rcu_access_pointer(nfsi->delegation) != NULL) { - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); - spin_unlock(&clp->cl_lock); + delegation = nfs_detach_delegation(nfsi, server); if (delegation != NULL) nfs_do_return_delegation(inode, delegation, 0); } } +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * Returns zero on success, or a negative errno value. + */ int nfs_inode_return_delegation(struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int err = 0; if (rcu_access_pointer(nfsi->delegation) != NULL) { - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); - spin_unlock(&clp->cl_lock); + delegation = nfs_detach_delegation(nfsi, server); if (delegation != NULL) { nfs_wb_all(inode); err = __nfs_inode_return_delegation(inode, delegation, 1); @@ -365,46 +408,61 @@ int nfs_inode_return_delegation(struct inode *inode) return err; } -static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) +static void nfs_mark_return_delegation(struct nfs_delegation *delegation) { + struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client; + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } -/* - * Return all delegations associated to a super block +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * */ void nfs_super_return_all_delegations(struct super_block *sb) { - struct nfs_client *clp = NFS_SB(sb)->nfs_client; + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; if (clp == NULL) return; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); - if (delegation->inode != NULL && delegation->inode->i_sb == sb) - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); spin_unlock(&delegation->lock); } rcu_read_unlock(); + if (nfs_client_return_marked_delegations(clp) != 0) nfs4_schedule_state_manager(clp); } -static -void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) +static void nfs_mark_return_all_delegation_types(struct nfs_server *server, + fmode_t flags) { struct nfs_delegation *delegation; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); } +} + +static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + 
nfs_mark_return_all_delegation_types(server, flags); rcu_read_unlock(); } @@ -419,19 +477,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } +/** + * nfs_expire_all_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * + */ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) { nfs_client_mark_return_all_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ void nfs_expire_all_delegations(struct nfs_client *clp) { nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); } -/* - * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. +/** + * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * */ void nfs_handle_cb_pathdown(struct nfs_client *clp) { @@ -440,29 +511,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp) nfs_client_mark_return_all_delegations(clp); } -static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); } - rcu_read_unlock(); } +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) { - nfs_client_mark_return_unreferenced_delegations(clp); + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); + nfs_delegation_run_state_manager(clp); } -/* - * Asynchronous delegation recall! +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information from CB_RECALL arguments + * + * Returns zero on success, or a negative errno value. 
*/ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; @@ -474,22 +559,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s rcu_read_unlock(); return -ENOENT; } - - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); rcu_read_unlock(); + nfs_delegation_run_state_manager(clp); return 0; } -/* - * Retrieve the inode associated with a delegation - */ -struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) { struct nfs_delegation *delegation; struct inode *res = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); if (delegation->inode != NULL && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { @@ -499,49 +583,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs if (res != NULL) break; } + return res; +} + +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. + */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) +{ + struct nfs_server *server; + struct inode *res = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) + break; + } rcu_read_unlock(); return res; } -/* - * Mark all delegations as needing to be reclaimed +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * */ void nfs_delegation_mark_reclaim(struct nfs_client *clp) { - struct nfs_delegation *delegation; + struct nfs_server *server; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) - set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); rcu_read_unlock(); } -/* - * Reap all unclaimed delegations after reboot recovery is done +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * */ void nfs_delegation_reap_unclaimed(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_server *server; struct inode *inode; + restart: rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), 
NULL, clp); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) - nfs_free_delegation(delegation); - iput(inode); - goto restart; + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) + nfs_free_delegation(delegation); + iput(inode); + goto restart; + } } rcu_read_unlock(); } +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * + * Returns one and fills in "dst->data" if inode had a delegation, + * otherwise zero is returned. + */ int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 2026304bda1..d9322e490c5 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); void nfs_handle_cb_pathdown(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d33da530097..2c3eb33b904 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,8 +33,8 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/sched.h> -#include <linux/vmalloc.h> #include <linux/kmemleak.h> +#include <linux/xattr.h> #include "delegation.h" #include "iostat.h" @@ -125,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .getxattr = nfs4_getxattr, - .setxattr = nfs4_setxattr, - .listxattr = nfs4_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, }; #endif /* CONFIG_NFS_V4 */ @@ -172,7 +173,7 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; @@ -378,14 +379,14 @@ error: return error; } -/* Fill in an entry based on the xdr code stored in desc->page */ -static -int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream) +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) { - __be32 *p =
desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); - if (IS_ERR(p)) - return PTR_ERR(p); + int error; + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; entry->fattr->time_start = desc->timestamp; entry->fattr->gencount = desc->gencount; return 0; @@ -438,7 +439,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (dentry == NULL) return; - d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops); inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); if (IS_ERR(inode)) goto out; @@ -459,25 +459,26 @@ out: /* Perform conversion from xdr to cache array */ static int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - void *xdr_page, struct page *page, unsigned int buflen) + struct page **xdr_pages, struct page *page, unsigned int buflen) { struct xdr_stream stream; - struct xdr_buf buf; - __be32 *ptr = xdr_page; + struct xdr_buf buf = { + .pages = xdr_pages, + .page_len = buflen, + .buflen = buflen, + .len = buflen, + }; + struct page *scratch; struct nfs_cache_array *array; unsigned int count = 0; int status; - buf.head->iov_base = xdr_page; - buf.head->iov_len = buflen; - buf.tail->iov_len = 0; - buf.page_base = 0; - buf.page_len = 0; - buf.buflen = buf.head->iov_len; - buf.len = buf.head->iov_len; - - xdr_init_decode(&stream, &buf, ptr); + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); do { status = xdr_decode(desc, entry, &stream); @@ -506,6 +507,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en } else status = PTR_ERR(array); } + + put_page(scratch); return status; } @@ -521,7 +524,6 @@ static void nfs_readdir_free_large_page(void *ptr, struct page **pages, unsigned int npages) { - vm_unmap_ram(ptr, npages); nfs_readdir_free_pagearray(pages, npages); } @@ -530,9 +532,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages, * to nfs_readdir_free_large_page */ static -void *nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_large_page(struct page **pages, unsigned int npages) { - void *ptr; unsigned int i; for (i = 0; i < npages; i++) { @@ -541,13 +542,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages) goto out_freepages; pages[i] = page; } + return 0; - ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL); - if (!IS_ERR_OR_NULL(ptr)) - return ptr; out_freepages: nfs_readdir_free_pagearray(pages, i); - return NULL; + return -ENOMEM; } static @@ -566,6 +565,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); if (entry.fh == NULL || entry.fattr == NULL) goto out; @@ -577,8 +577,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - pages_ptr = nfs_readdir_large_page(pages, array_size); - if (!pages_ptr) + status = nfs_readdir_large_page(pages, array_size); + if (status < 0) goto out_release_array; do { unsigned int pglen; @@ -587,7 +587,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (status < 0) break; pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); + status = nfs_readdir_page_filler(desc, &entry, pages, page, 
pglen); if (status < 0) { if (status == -ENOSPC) status = 0; @@ -970,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) { struct nfs_server *server = NFS_SERVER(inode); - if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) + if (IS_AUTOMOUNT(inode)) return 0; if (nd != NULL) { /* VFS wants an on-the-wire revalidation */ @@ -1173,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) @@ -1192,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru if (dentry->d_name.len > NFS_SERVER(dir)->namelen) goto out; - d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops); - /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. @@ -1221,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru goto out_unblock_sillyrename; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr); - res = (struct dentry *)inode; + res = ERR_CAST(inode); if (IS_ERR(res)) goto out_unblock_sillyrename; @@ -1248,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs_open_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; /* @@ -1337,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = ERR_PTR(-ENAMETOOLONG); goto out; } - d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops); /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash * the dentry. */ @@ -1355,8 +1354,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; - if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current_umask(); + attr.ia_mode &= ~current_umask(); } else { open_flags &= ~(O_EXCL | O_CREAT); attr.ia_valid = 0; @@ -1410,11 +1408,15 @@ no_open: static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) { struct dentry *parent = NULL; - struct inode *inode = dentry->d_inode; + struct inode *inode; struct inode *dir; struct nfs_open_context *ctx; int openflags, ret = 0; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; if (!is_atomic_open(nd) || d_mountpoint(dentry)) goto no_open; @@ -1583,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, { struct iattr attr; int error; + int open_flags = 0; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1590,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); + if ((nd->flags & LOOKUP_CREATE) != 0) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); if (error != 0) goto out_err; return 0; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 5596c6a2881..b5ffe8fa291 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -119,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) } security_d_instantiate(ret, inode); - - if (ret->d_op == NULL) - d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops); out: 
nfs_free_fattr(fsinfo.fattr); return ret; @@ -227,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) security_d_instantiate(ret, inode); - if (ret->d_op == NULL) - d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops); - out: nfs_free_fattr(fattr); dprintk("<-- nfs4_get_root()\n"); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 4e2d9b6b138..18696882f1c 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu return nfs_idmap_lookup_name(gid, "group", buf, buflen); } -#else /* CONFIG_NFS_USE_IDMAPPER not defined */ +#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ #include <linux/module.h> #include <linux/mutex.h> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 017daa3bed3..d8512423ba7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; - set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); + inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; @@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update the fsid? */ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && - !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) + !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* @@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ void nfs4_evict_inode(struct inode *inode) { + pnfs_destroy_layout(NFS_I(inode)); truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); - pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! 
*/ nfs_inode_return_delegation_noreclaim(inode); /* First call standard NFS clear_inode() code */ @@ -1619,6 +1619,7 @@ static void __exit exit_nfs_fs(void) #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif + nfs_cleanup_cb_ident_idr(); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e6356b750b7..4644f04b4b4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -128,9 +128,13 @@ extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern struct rpc_program nfs_program; +extern void nfs_cleanup_cb_ident_idr(void); extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); -extern struct nfs_client *nfs_find_client_next(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); +extern struct nfs_client *nfs4_find_client_ident(int); +extern struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *, + int); extern struct nfs_server *nfs_create_server( const struct nfs_parsed_mount_data *, struct nfs_fh *); @@ -185,17 +189,20 @@ extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); /* nfs2xdr.c */ -extern int nfs_stat_to_errno(int); +extern int nfs_stat_to_errno(enum nfs_stat); extern struct rpc_procinfo nfs_procedures[]; -extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs3xdr.c */ extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; @@ -245,6 +252,7 @@ extern char *nfs_path(const char *base, const struct dentry *droot, const struct dentry *dentry, char *buffer, ssize_t buflen); +extern struct vfsmount *nfs_d_automount(struct path *path); /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 4f981f1f668..d4c2d6b7507 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info) .authflavor = RPC_AUTH_UNIX, .flags = RPC_CLNT_CREATE_NOPING, }; - struct mountres result; struct rpc_message msg = { .rpc_argp = info->dirpath, - .rpc_resp = &result, }; struct rpc_clnt *clnt; int status; @@ -248,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; clnt = rpc_create(&args); - if (unlikely(IS_ERR(clnt))) + if (IS_ERR(clnt)) goto out_clnt_err; dprintk("NFS: sending UMNT request for %s:%s\n", @@ -280,29 +278,20 @@ out_call_err: * XDR encode/decode functions for MOUNT */ -static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) { const u32 pathname_len = strlen(pathname); __be32 *p; - if (unlikely(pathname_len > MNTPATHLEN)) - return -EIO; - - p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len); - if (unlikely(p == NULL)) 
- return -EIO; + BUG_ON(pathname_len > MNTPATHLEN); + p = xdr_reserve_space(xdr, 4 + pathname_len); xdr_encode_opaque(p, pathname, pathname_len); - - return 0; } -static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p, - const char *dirpath) +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - return encode_mntdirpath(&xdr, dirpath); + encode_mntdirpath(xdr, dirpath); } /* @@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res) u32 status; __be32 *p; - p = xdr_inline_decode(xdr, sizeof(status)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - status = ntohl(*p); + status = be32_to_cpup(p); for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { if (mnt_errtbl[i].status == status) { @@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) return 0; } -static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p, - struct mountres *res) +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) { - struct xdr_stream xdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - - status = decode_status(&xdr, res); + status = decode_status(xdr, res); if (unlikely(status != 0 || res->errno != 0)) return status; - return decode_fhandle(&xdr, res); + return decode_fhandle(xdr, res); } static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) @@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) u32 status; __be32 *p; - p = xdr_inline_decode(xdr, sizeof(status)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - status = ntohl(*p); + status = be32_to_cpup(p); for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { if (mnt3_errtbl[i].status == status) { @@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res) u32 size; __be32 *p; - p = xdr_inline_decode(xdr, sizeof(size)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - size = ntohl(*p++); + size = be32_to_cpup(p); if (size > NFS3_FHSIZE || size == 0) return -EIO; @@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) if (*count == 0) return 0; - p = xdr_inline_decode(xdr, sizeof(entries)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - entries = ntohl(*p); + entries = be32_to_cpup(p); dprintk("NFS: received %u auth flavors\n", entries); if (entries > NFS_MAX_SECFLAVORS) entries = NFS_MAX_SECFLAVORS; - p = xdr_inline_decode(xdr, sizeof(u32) * entries); + p = xdr_inline_decode(xdr, 4 * entries); if (unlikely(p == NULL)) return -EIO; @@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) entries = *count; for (i = 0; i < entries; i++) { - flavors[i] = ntohl(*p++); + flavors[i] = be32_to_cpup(p++); dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); } *count = i; @@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) return 0; } -static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, - struct mountres *res) +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) { - struct xdr_stream xdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - - status = decode_fhs_status(&xdr, res); + status = decode_fhs_status(xdr, res); if 
(unlikely(status != 0 || res->errno != 0)) return status; - status = decode_fhandle3(&xdr, res); + status = decode_fhandle3(xdr, res); if (unlikely(status != 0)) { res->errno = -EBADHANDLE; return 0; } - return decode_auth_flavors(&xdr, res); + return decode_auth_flavors(xdr, res); } static struct rpc_procinfo mnt_procedures[] = { [MOUNTPROC_MNT] = { .p_proc = MOUNTPROC_MNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, - .p_decode = (kxdrproc_t)mnt_dec_mountres, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres_sz, .p_statidx = MOUNTPROC_MNT, @@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = { }, [MOUNTPROC_UMNT] = { .p_proc = MOUNTPROC_UMNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_arglen = MNT_enc_dirpath_sz, .p_statidx = MOUNTPROC_UMNT, .p_name = "UMOUNT", @@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = { static struct rpc_procinfo mnt3_procedures[] = { [MOUNTPROC3_MNT] = { .p_proc = MOUNTPROC3_MNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, - .p_decode = (kxdrproc_t)mnt_dec_mountres3, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres3_sz, .p_statidx = MOUNTPROC3_MNT, @@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = { }, [MOUNTPROC3_UMNT] = { .p_proc = MOUNTPROC3_UMNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_arglen = MNT_enc_dirpath_sz, .p_statidx = MOUNTPROC3_UMNT, .p_name = "UMOUNT", diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 74aaf3963c1..f32b8603dca 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -97,9 +97,8 @@ Elong: } /* - * nfs_follow_mountpoint - handle crossing a mountpoint on the server - * @dentry - dentry of mountpoint - * @nd - nameidata info + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint * * When we encounter a mountpoint on the server, we want to set up * a mountpoint on the client too, to prevent inode numbers from @@ -109,87 +108,65 @@ Elong: * situation, and that different filesystems may want to use * different security flavours. 
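 *
 * Editor's aside (not part of the patch): the rewrite below drops the
 * old ->follow_link() trick in favour of the VFS ->d_automount() hook,
 * which hands back either a vfsmount or an errno encoded in the pointer
 * itself via ERR_PTR(). The sketch that follows is a minimal userspace
 * model of that convention, with made-up values standing in for real
 * kernel objects; it only illustrates why "mnt = ERR_PTR(-ESTALE)" and
 * "if (IS_ERR(mnt))" below compose cleanly.
 */

#include <stdio.h>

#define MAX_ERRNO 4095	/* same bound the kernel uses */

static inline void *ERR_PTR(long error)
{
	/* negative errnos land in the top, never-mappable address range */
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *automount_model(int fail)
{
	static int fake_vfsmount;	/* stand-in for struct vfsmount */

	if (fail)
		return ERR_PTR(-116);	/* -ESTALE on Linux */
	return &fake_vfsmount;
}

int main(void)
{
	void *mnt = automount_model(1);

	if (IS_ERR(mnt))
		printf("automount failed: %ld\n", PTR_ERR(mnt));
	else
		printf("mounted at %p\n", mnt);
	return 0;
}

/* End of editor's aside; the patch text resumes below.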
*/ -static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); struct dentry *parent; struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; int err; - dprintk("--> nfs_follow_mountpoint()\n"); + dprintk("--> nfs_d_automount()\n"); - err = -ESTALE; - if (IS_ROOT(dentry)) - goto out_err; + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; - err = -ENOMEM; + mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr(); if (fh == NULL || fattr == NULL) - goto out_err; + goto out; dprintk("%s: enter\n", __func__); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - /* Look it up again */ - parent = dget_parent(nd->path.dentry); + /* Look it up again to get its attributes */ + parent = dget_parent(path->dentry); err = server->nfs_client->rpc_ops->lookup(parent->d_inode, - &nd->path.dentry->d_name, + &path->dentry->d_name, fh, fattr); dput(parent); - if (err != 0) - goto out_err; + if (err != 0) { + mnt = ERR_PTR(err); + goto out; + } if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) - mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); + mnt = nfs_do_refmount(path->mnt, path->dentry); else - mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, - fattr); - err = PTR_ERR(mnt); + mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); if (IS_ERR(mnt)) - goto out_err; + goto out; - mntget(mnt); - err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, - &nfs_automount_list); - if (err < 0) { - mntput(mnt); - if (err == -EBUSY) - goto out_follow; - goto out_err; - } - path_put(&nd->path); - nd->path.mnt = mnt; - nd->path.dentry = dget(mnt->mnt_root); + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); - dprintk("%s: done, returned %d\n", __func__, err); - - dprintk("<-- nfs_follow_mountpoint() = %d\n", err); - return ERR_PTR(err); -out_err: - path_put(&nd->path); - goto out; -out_follow: - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - goto out; +out_nofree: + dprintk("<-- nfs_d_automount() = %p\n", mnt); + return mnt; } const struct inode_operations nfs_mountpoint_inode_operations = { - .follow_link = nfs_follow_mountpoint, .getattr = nfs_getattr, }; const struct inode_operations nfs_referral_inode_operations = { - .follow_link = nfs_follow_mountpoint, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 5914a1911c9..792cb13a430 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -61,584 +61,1008 @@ #define NFS_readdirres_sz (1) #define NFS_statfsres_sz (1+NFS_info_sz) + /* - * Common NFS XDR functions as inlines + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. 
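 *
 * Editor's aside (not part of the patch): a toy userspace model of the
 * head/pages split that this helper arranges. Real sunrpc code uses
 * struct xdr_buf and xdr_inline_pages(); the struct and field names
 * below are simplified stand-ins, and the reply-length arithmetic
 * (header words shifted left by 2 to get bytes) mirrors the helper.
 */

#include <stdio.h>
#include <string.h>

struct toy_xdr_buf {
	char head[64];		/* fixed RPC + NFS reply header */
	char *page_data;	/* caller-supplied "page cache" memory */
	size_t page_len;
};

/*
 * Split a received reply: the first hdrlen bytes go to "head" and the
 * rest to the caller's pages. (The kernel receives straight into the
 * pages; this model copies only to show where each byte ends up.)
 */
static void receive_reply(struct toy_xdr_buf *buf, const char *wire,
			  size_t len, size_t hdrlen)
{
	memcpy(buf->head, wire, hdrlen);
	buf->page_len = len - hdrlen;
	memcpy(buf->page_data, wire + hdrlen, buf->page_len);
}

int main(void)
{
	char pages[64];
	struct toy_xdr_buf buf = { .page_data = pages };
	const char wire[] = "HDRHDRHDRHDRpayload-for-the-page-cache";
	unsigned int replen = (2 + 1) << 2;	/* 3 header words = 12 bytes */

	receive_reply(&buf, wire, sizeof(wire) - 1, replen);
	printf("payload: %.*s\n", (int)buf.page_len, buf.page_data);
	return 0;
}

/* End of editor's aside; the patch text resumes below.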
*/ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle) +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) { - memcpy(p, fhandle->data, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); } -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle) +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { - /* NFSv2 handles have a fixed length */ - fhandle->size = NFS2_FHSIZE; - memcpy(fhandle->data, p, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + * typedef opaque nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, count); + result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */ + result->count = count; + return count; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * enum stat { + * NFS_OK = 0, + * NFSERR_PERM = 1, + * NFSERR_NOENT = 2, + * NFSERR_IO = 5, + * NFSERR_NXIO = 6, + * NFSERR_ACCES = 13, + * NFSERR_EXIST = 17, + * NFSERR_NODEV = 19, + * NFSERR_NOTDIR = 20, + * NFSERR_ISDIR = 21, + * NFSERR_FBIG = 27, + * NFSERR_NOSPC = 28, + * NFSERR_ROFS = 30, + * NFSERR_NAMETOOLONG = 63, + * NFSERR_NOTEMPTY = 66, + * NFSERR_DQUOT = 69, + * NFSERR_STALE = 70, + * NFSERR_WFLUSH = 99 + * }; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32* -xdr_encode_time(__be32 *p, struct timespec *timep) +/* + * 2.3.2. ftype + * + * enum ftype { + * NFNON = 0, + * NFREG = 1, + * NFDIR = 2, + * NFBLK = 3, + * NFCHR = 4, + * NFLNK = 5 + * }; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type) { - *p++ = htonl(timep->tv_sec); - /* Convert nanoseconds into microseconds */ - *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0); + *type = be32_to_cpup(p++); + if (unlikely(*type > NF2FIFO)) + *type = NFBAD; return p; } -static inline __be32* -xdr_encode_current_server_time(__be32 *p, struct timespec *timep) +/* + * 2.3.3. 
fhandle + * + * typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) { - /* - * Passing the invalid value useconds=1000000 is a - * Sun convention for "set to current server time". - * It's needed to make permissions checks for the - * "touch" program across v2 mounts to Solaris and - * Irix boxes work correctly. See description of - * sattr in section 6.1 of "NFS Illustrated" by - * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 - */ - *p++ = htonl(timep->tv_sec); - *p++ = htonl(1000000); + __be32 *p; + + BUG_ON(fh->size != NFS2_FHSIZE); + p = xdr_reserve_space(xdr, NFS2_FHSIZE); + memcpy(p, fh->data, NFS2_FHSIZE); +} + +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.4. timeval + * + * struct timeval { + * unsigned int seconds; + * unsigned int useconds; + * }; + */ +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + if (timep->tv_nsec != 0) + *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); + else + *p++ = cpu_to_be32(0); return p; } -static inline __be32* -xdr_decode_time(__be32 *p, struct timespec *timep) +/* + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time". It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly. See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. + */ +static __be32 *xdr_encode_current_server_time(__be32 *p, + const struct timespec *timep) { - timep->tv_sec = ntohl(*p++); - /* Convert microseconds into nanoseconds */ - timep->tv_nsec = ntohl(*p++) * 1000; + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(1000000); return p; } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; + return p; +} + +/* + * 2.3.5. 
fattr + * + * struct fattr { + * ftype type; + * unsigned int mode; + * unsigned int nlink; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * unsigned int blocksize; + * unsigned int rdev; + * unsigned int blocks; + * unsigned int fsid; + * unsigned int fileid; + * timeval atime; + * timeval mtime; + * timeval ctime; + * }; + * + */ +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { u32 rdev, type; - type = ntohl(*p++); - fattr->mode = ntohl(*p++); - fattr->nlink = ntohl(*p++); - fattr->uid = ntohl(*p++); - fattr->gid = ntohl(*p++); - fattr->size = ntohl(*p++); - fattr->du.nfs2.blocksize = ntohl(*p++); - rdev = ntohl(*p++); - fattr->du.nfs2.blocks = ntohl(*p++); - fattr->fsid.major = ntohl(*p++); - fattr->fsid.minor = 0; - fattr->fileid = ntohl(*p++); - p = xdr_decode_time(p, &fattr->atime); - p = xdr_decode_time(p, &fattr->mtime); - p = xdr_decode_time(p, &fattr->ctime); + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + fattr->valid |= NFS_ATTR_FATTR_V2; + + p = xdr_decode_ftype(p, &type); + + fattr->mode = be32_to_cpup(p++); + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + fattr->size = be32_to_cpup(p++); + fattr->du.nfs2.blocksize = be32_to_cpup(p++); + + rdev = be32_to_cpup(p++); fattr->rdev = new_decode_dev(rdev); - if (type == NFCHR && rdev == NFS2_FIFO_DEV) { + if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } + + fattr->du.nfs2.blocks = be32_to_cpup(p++); + fattr->fsid.major = be32_to_cpup(p++); + fattr->fsid.minor = 0; + fattr->fileid = be32_to_cpup(p++); + + p = xdr_decode_time(p, &fattr->atime); + p = xdr_decode_time(p, &fattr->mtime); + xdr_decode_time(p, &fattr->ctime); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.6. sattr + * + * struct sattr { + * unsigned int mode; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * timeval atime; + * timeval mtime; + * }; + */ + +#define NFS2_SATTR_NOT_SET (0xffffffff) + +static __be32 *xdr_time_not_set(__be32 *p) +{ + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); return p; } -static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) { - const __be32 not_set = __constant_htonl(0xFFFFFFFF); + __be32 *p; - *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; - *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; - *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; - *p++ = (attr->ia_valid & ATTR_SIZE) ? 
htonl(attr->ia_size) : not_set; + p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); - if (attr->ia_valid & ATTR_ATIME_SET) { + if (attr->ia_valid & ATTR_MODE) + *p++ = cpu_to_be32(attr->ia_mode); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_UID) + *p++ = cpu_to_be32(attr->ia_uid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_GID) + *p++ = cpu_to_be32(attr->ia_gid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_SIZE) + *p++ = cpu_to_be32((u32)attr->ia_size); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + + if (attr->ia_valid & ATTR_ATIME_SET) p = xdr_encode_time(p, &attr->ia_atime); - } else if (attr->ia_valid & ATTR_ATIME) { + else if (attr->ia_valid & ATTR_ATIME) p = xdr_encode_current_server_time(p, &attr->ia_atime); - } else { - *p++ = not_set; - *p++ = not_set; - } - - if (attr->ia_valid & ATTR_MTIME_SET) { - p = xdr_encode_time(p, &attr->ia_mtime); - } else if (attr->ia_valid & ATTR_MTIME) { - p = xdr_encode_current_server_time(p, &attr->ia_mtime); - } else { - *p++ = not_set; - *p++ = not_set; - } - return p; + else + p = xdr_time_not_set(p); + if (attr->ia_valid & ATTR_MTIME_SET) + xdr_encode_time(p, &attr->ia_mtime); + else if (attr->ia_valid & ATTR_MTIME) + xdr_encode_current_server_time(p, &attr->ia_mtime); + else + xdr_time_not_set(p); } /* - * NFS encode functions + * 2.3.7. filename + * + * typedef string filename<MAXNAMLEN>; */ +static void encode_filename(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_filename_inline(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + /* - * Encode file handle argument - * GETATTR, READLINK, STATFS + * 2.3.8. 
path + * + * typedef string path<MAXPATHLEN>; */ -static int -nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) { - p = xdr_encode_fhandle(p, fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + __be32 *p; + + BUG_ON(length > NFS2_MAXPATHLEN); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_path(struct xdr_stream *xdr) +{ + u32 length, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p); + if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) + goto out_size; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(length > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, length); + xdr_terminate_string(xdr->buf, length); return 0; +out_size: + dprintk("NFS: returned pathname too long: %u\n", length); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "length %u > received %u\n", length, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode SETATTR arguments + * 2.3.9. attrstat + * + * union attrstat switch (stat status) { + * case NFS_OK: + * fattr attributes; + * default: + * void; + * }; */ -static int -nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args) +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Encode directory ops argument - * LOOKUP, RMDIR + * 2.3.10. diropargs + * + * struct diropargs { + * fhandle dir; + * filename name; + * }; */ -static int -nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_fhandle(xdr, fh); + encode_filename(xdr, name, length); } /* - * Encode REMOVE argument + * 2.3.11. 
diropres + * + * union diropres switch (stat status) { + * case NFS_OK: + * struct { + * fhandle file; + * fattr attributes; + * } diropok; + * default: + * void; + * }; */ -static int -nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name.name, args->name.len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + int error; + + error = decode_fhandle(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_fattr(xdr, result->fattr); +out: + return error; +} + +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_diropok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } + /* - * Arguments to a READ call. Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". */ -static int -nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 offset = (u32)args->offset; + encode_fhandle(xdr, fh); +} + +/* + * 2.2.3. sattrargs + * + * struct sattrargs { + * fhandle file; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) +{ + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); +} + +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) +{ + encode_fhandle(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); +} + +/* + * 2.2.7. 
readargs + * + * struct readargs { + * fhandle file; + * unsigned offset; + * unsigned count; + * unsigned totalcount; + * }; + */ +static void encode_readargs(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + u32 offset = args->offset; u32 count = args->count; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(offset); - *p++ = htonl(count); - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + encode_fhandle(xdr, args->fh); - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, - args->pages, args->pgbase, count); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + *p = cpu_to_be32(count); +} + +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_readargs(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; - return 0; } /* - * Decode READ reply + * 2.2.9. writeargs + * + * struct writeargs { + * fhandle file; + * unsigned beginoffset; + * unsigned offset; + * unsigned totalcount; + * nfsdata data; + * }; */ -static int -nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) +static void encode_writeargs(struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct kvec *iov = req->rq_rcv_buf.head; - size_t hdrlen; - u32 count, recvd; - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - p = xdr_decode_fattr(p, res->fattr); - - count = ntohl(*p++); - res->eof = 0; - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READ reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READ header is short. iovec will be shifted.\n"); - xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); - } + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; - recvd = req->rq_rcv_buf.len - hdrlen; - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - count = recvd; - } + encode_fhandle(xdr, args->fh); - dprintk("RPC: readres OK count %u\n", count); - if (count < res->count) - res->count = count; + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); - return count; + /* nfsdata */ + *p = cpu_to_be32(count); + xdr_write_pages(xdr, args->pages, args->pgbase, count); } +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} /* - * Write arguments. Splice the buffer to be written into the iovec. + * 2.2.10. 
createargs + * + * struct createargs { + * diropargs where; + * sattr attributes; + * }; */ -static int -nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - u32 offset = (u32)args->offset; - u32 count = args->count; - - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(offset); - *p++ = htonl(offset); - *p++ = htonl(count); - *p++ = htonl(count); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); + encode_diropargs(xdr, args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); +} - /* Copy the page array */ - xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); - sndbuf->flags |= XDRBUF_WRITE; - return 0; +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); } /* - * Encode create arguments - * CREATE, MKDIR + * 2.2.12. renameargs + * + * struct renameargs { + * diropargs from; + * diropargs to; + * }; */ -static int -nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args) +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); } /* - * Encode RENAME arguments + * 2.2.13. linkargs + * + * struct linkargs { + * fhandle from; + * diropargs to; + * }; */ -static int -nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) { - p = xdr_encode_fhandle(p, args->old_dir); - p = xdr_encode_array(p, args->old_name->name, args->old_name->len); - p = xdr_encode_fhandle(p, args->new_dir); - p = xdr_encode_array(p, args->new_name->name, args->new_name->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); } /* - * Encode LINK arguments + * 2.2.14. symlinkargs + * + * struct symlinkargs { + * diropargs from; + * path to; + * sattr attributes; + * }; */ -static int -nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args) +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); } /* - * Encode SYMLINK arguments + * 2.2.17. 
readdirargs + * + * struct readdirargs { + * fhandle dir; + * nfscookie cookie; + * unsigned count; + * }; */ -static int -nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args) +static void encode_readdirargs(struct xdr_stream *xdr, + const struct nfs_readdirargs *args) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - size_t pad; + __be32 *p; - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - *p++ = htonl(args->pathlen); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); + encode_fhandle(xdr, args->fh); - xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); + p = xdr_reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(args->cookie); + *p = cpu_to_be32(args->count); +} - /* - * xdr_encode_pages may have added a few bytes to ensure the - * pathname ends on a 4-byte boundary. Start encoding the - * attributes after the pad bytes. - */ - pad = sndbuf->tail->iov_len; - if (pad > 0) - p++; - p = xdr_encode_sattr(p, args->sattr); - sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad; - return 0; +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + encode_readdirargs(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS_readdirres_sz); } /* - * Encode arguments to readdir call + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". */ -static int -nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 count = args->count; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->cookie); - *p++ = htonl(count); /* see above */ - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + return decode_attrstat(xdr, result); +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); - return 0; +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_diropok *result) +{ + return decode_diropres(xdr, result); } /* - * Decode the result of a readdir call. - * We're not really decoding anymore, we just leave the buffer untouched - * and only check that it is syntactically correct. - * The real decoding happens in nfs_decode_entry below, called directly - * from nfs_readdir for each entry. + * 2.2.6. 
readlinkres + * + * union readlinkres switch (stat status) { + * case NFS_OK: + * path data; + * default: + * void; + * }; */ -static int -nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - struct page **page; - size_t hdrlen; - unsigned int pglen, recvd; - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READDIR reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READDIR header is short. iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_path(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - pglen = rcvbuf->page_len; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - page = rcvbuf->pages; - return pglen; +/* + * 2.2.7. readres + * + * union readres switch (stat status) { + * case NFS_OK: + * fattr attributes; + * nfsdata data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_nfsdata(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) { - dprintk("nfs: %s: prematurely hit end of receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); + /* All NFSv2 writes are "file sync" writes */ + result->verf->committed = NFS_FILE_SYNC; + return decode_attrstat(xdr, result->fattr); } -__be32 * -nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17. 
entry + * + * struct entry { + * unsigned fileid; + * filename name; + * nfscookie cookie; + * entry *nextentry; + * }; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) { __be32 *p; + int error; + p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) goto out_overflow; - if (!ntohl(*p++)) { + if (*p++ == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) goto out_overflow; - if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); + if (*p++ == xdr_zero) + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } - p = xdr_inline_decode(xdr, 8); - if (unlikely(!p)) + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) goto out_overflow; + entry->ino = be32_to_cpup(p); - entry->ino = ntohl(*p++); - entry->len = ntohl(*p++); + error = decode_filename_inline(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; - p = xdr_inline_decode(xdr, entry->len + 4); - if (unlikely(!p)) + /* + * The type (size and byte order) of nfscookie isn't defined in + * RFC 1094. This implementation assumes that it's an XDR uint32. + */ + entry->prev_cookie = entry->cookie; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) goto out_overflow; - entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); - entry->prev_cookie = entry->cookie; - entry->cookie = ntohl(*p++); + entry->cookie = be32_to_cpup(p); entry->d_type = DT_UNKNOWN; - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - - return p; + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EAGAIN); -} - -/* - * NFS XDR decode functions - */ -/* - * Decode simple status reply - */ -static int -nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) -{ - int status; - - if ((status = ntohl(*p++)) != 0) - status = nfs_stat_to_errno(status); - return status; + return -EAGAIN; } /* - * Decode attrstat reply - * GETATTR, SETATTR, WRITE - */ -static int -nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) -{ - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - xdr_decode_fattr(p, fattr); - return 0; -} - -/* - * Decode diropres reply - * LOOKUP, CREATE, MKDIR + * 2.2.17. readdirres + * + * union readdirres switch (stat status) { + * case NFS_OK: + * struct { + * entry *entries; + * bool eof; + * } readdirok; + * default: + * void; + * }; + * + * Read the directory contents into the page cache, but don't + * touch them. The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls. 
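 *
 * Editor's aside (not part of the patch): a self-contained userspace
 * model of what nfs2_decode_dirent() does with those cached pages.
 * The buffer layout follows the "entry" XDR definition quoted above;
 * the fixed-size name buffer and the bare be32 reads are
 * simplifications, and the kernel's xdr_stream bounds checking is
 * reduced to a single length check.
 */

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dir_entry {
	uint32_t fileid;
	char name[64];
	uint32_t cookie;
};

static uint32_t get_be32(const unsigned char *p)
{
	uint32_t v;

	memcpy(&v, p, 4);
	return ntohl(v);
}

/* Returns bytes consumed, or -1 when the value-follows word is FALSE. */
static int decode_one_entry(const unsigned char *p, struct dir_entry *e)
{
	const unsigned char *start = p;
	uint32_t len;

	if (get_be32(p) == 0)		/* no further entry */
		return -1;
	p += 4;
	e->fileid = get_be32(p); p += 4;
	len = get_be32(p); p += 4;
	if (len >= sizeof(e->name))
		return -1;
	memcpy(e->name, p, len);
	e->name[len] = '\0';
	p += (len + 3) & ~3u;		/* XDR pads opaques to 4 bytes */
	e->cookie = get_be32(p); p += 4;
	return p - start;
}

int main(void)
{
	/* One hand-built entry: fileid 7, name "a", cookie 1, then FALSE. */
	const unsigned char page[] = {
		0,0,0,1, 0,0,0,7, 0,0,0,1, 'a',0,0,0, 0,0,0,1, 0,0,0,0,
	};
	struct dir_entry e;

	if (decode_one_entry(page, &e) > 0)
		printf("fileid=%u name=%s cookie=%u\n",
		       e.fileid, e.name, e.cookie);
	return 0;
}

/* End of editor's aside; the patch text resumes below.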
*/ -static int -nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) +static int decode_readdirok(struct xdr_stream *xdr) { - int status; + u32 recvd, pglen; + size_t hdrlen; - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - p = xdr_decode_fhandle(p, res->fh); - xdr_decode_fattr(p, res->fattr); - return 0; + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; } -/* - * Encode READLINK args - */ -static int -nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - p = xdr_encode_fhandle(p, args->fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); - return 0; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_readdirok(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Decode READLINK reply + * 2.2.18. statfsres + * + * union statfsres (stat status) { + * case NFS_OK: + * struct { + * unsigned tsize; + * unsigned bsize; + * unsigned blocks; + * unsigned bfree; + * unsigned bavail; + * } info; + * default: + * void; + * }; */ -static int -nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 len, recvd; - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - /* Convert length of symlink */ - len = ntohl(*p++); - if (len >= rcvbuf->page_len) { - dprintk("nfs: server returned giant symlink!\n"); - return -ENAMETOOLONG; - } - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READLINK reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READLINK header is short. 
iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } - recvd = req->rq_rcv_buf.len - hdrlen; - if (recvd < len) { - dprintk("NFS: server cheating in readlink reply: " - "count %u > recvd %u\n", len, recvd); - return -EIO; - } + __be32 *p; - xdr_terminate_string(rcvbuf, len); + p = xdr_inline_decode(xdr, NFS_info_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + result->tsize = be32_to_cpup(p++); + result->bsize = be32_to_cpup(p++); + result->blocks = be32_to_cpup(p++); + result->bfree = be32_to_cpup(p++); + result->bavail = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -/* - * Decode WRITE reply - */ -static int -nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs2_fsstat *result) { - res->verf->committed = NFS_FILE_SYNC; - return nfs_xdr_attrstat(req, p, res->fattr); + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_info(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } -/* - * Decode STATFS reply - */ -static int -nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res) -{ - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - - res->tsize = ntohl(*p++); - res->bsize = ntohl(*p++); - res->blocks = ntohl(*p++); - res->bfree = ntohl(*p++); - res->bavail = ntohl(*p++); - return 0; -} /* * We need to translate between nfs status return values and * the local errno values which may not be the same. */ -static struct { +static const struct { int stat; int errno; } nfs_errtbl[] = { @@ -678,28 +1102,30 @@ static struct { { -1, -EIO } }; -/* - * Convert an NFS error code to a local one. - * This one is used jointly by NFSv2 and NFSv3. +/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. 
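 *
 * Editor's aside (not part of the patch): the translation below is a
 * sentinel-terminated table walk. The sketch that follows reproduces
 * the pattern in plain C with only a handful of the real mappings;
 * the { -1, -EIO } row terminates the scan and doubles as the
 * fallback for unrecognized status codes.
 */

#include <errno.h>
#include <stdio.h>

static const struct {
	int stat;
	int errno_value;
} errtbl[] = {
	{ 0,	0	},	/* NFS_OK */
	{ 1,	-EPERM	},	/* NFSERR_PERM */
	{ 2,	-ENOENT	},	/* NFSERR_NOENT */
	{ 70,	-ESTALE	},	/* NFSERR_STALE */
	{ -1,	-EIO	},	/* sentinel and fallback */
};

static int stat_to_errno(int status)
{
	int i;

	for (i = 0; errtbl[i].stat != -1; i++)
		if (errtbl[i].stat == status)
			return errtbl[i].errno_value;
	return errtbl[i].errno_value;	/* unknown status: -EIO */
}

int main(void)
{
	printf("stat 70 -> %d, stat 9999 -> %d\n",
	       stat_to_errno(70), stat_to_errno(9999));
	return 0;
}

/* End of editor's aside; the patch text resumes below.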
*/ -int -nfs_stat_to_errno(int stat) +int nfs_stat_to_errno(enum nfs_stat status) { int i; for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == stat) + if (nfs_errtbl[i].stat == (int)status) return nfs_errtbl[i].errno; } - dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); + dprintk("NFS: Unrecognized nfs status value: %u\n", status); return nfs_errtbl[i].errno; } #define PROC(proc, argtype, restype, timer) \ [NFSPROC_##proc] = { \ .p_proc = NFSPROC_##proc, \ - .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ .p_arglen = NFS_##argtype##_sz, \ .p_replen = NFS_##restype##_sz, \ .p_timer = timer, \ @@ -707,21 +1133,21 @@ nfs_stat_to_errno(int stat) .p_name = #proc, \ } struct rpc_procinfo nfs_procedures[] = { - PROC(GETATTR, fhandle, attrstat, 1), - PROC(SETATTR, sattrargs, attrstat, 0), - PROC(LOOKUP, diropargs, diropres, 2), - PROC(READLINK, readlinkargs, readlinkres, 3), - PROC(READ, readargs, readres, 3), - PROC(WRITE, writeargs, writeres, 4), - PROC(CREATE, createargs, diropres, 0), - PROC(REMOVE, removeargs, stat, 0), - PROC(RENAME, renameargs, stat, 0), - PROC(LINK, linkargs, stat, 0), - PROC(SYMLINK, symlinkargs, stat, 0), - PROC(MKDIR, createargs, diropres, 0), - PROC(RMDIR, diropargs, stat, 0), - PROC(READDIR, readdirargs, readdirres, 3), - PROC(STATFS, fhandle, statfsres, 0), + PROC(GETATTR, fhandle, attrstat, 1), + PROC(SETATTR, sattrargs, attrstat, 0), + PROC(LOOKUP, diropargs, diropres, 2), + PROC(READLINK, readlinkargs, readlinkres, 3), + PROC(READ, readargs, readres, 3), + PROC(WRITE, writeargs, writeres, 4), + PROC(CREATE, createargs, diropres, 0), + PROC(REMOVE, removeargs, stat, 0), + PROC(RENAME, renameargs, stat, 0), + PROC(LINK, linkargs, stat, 0), + PROC(SYMLINK, symlinkargs, stat, 0), + PROC(MKDIR, createargs, diropres, 0), + PROC(RMDIR, diropargs, stat, 0), + PROC(READDIR, readdirargs, readdirres, 3), + PROC(STATFS, fhandle, statfsres, 0), }; struct rpc_version nfs_version2 = { diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index f6cc60f06da..01c5e8b1941 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -37,18 +37,16 @@ #define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) #define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) #define NFS3_fattr_sz (21) -#define NFS3_wcc_attr_sz (6) +#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) +#define NFS3_wcc_attr_sz (6) #define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) #define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) -#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) -#define NFS3_fsstat_sz -#define NFS3_fsinfo_sz -#define NFS3_pathconf_sz -#define NFS3_entry_sz (NFS3_filename_sz+3) - -#define NFS3_sattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) #define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) -#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) + +#define NFS3_getattrargs_sz (NFS3_fh_sz) +#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz) #define NFS3_accessargs_sz (NFS3_fh_sz+1) #define NFS3_readlinkargs_sz (NFS3_fh_sz) #define NFS3_readargs_sz (NFS3_fh_sz+3) @@ -57,14 +55,16 @@ #define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) #define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) #define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_removeargs_sz 
(NFS3_fh_sz+NFS3_filename_sz) #define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) #define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) -#define NFS3_readdirargs_sz (NFS3_fh_sz+2) +#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4) #define NFS3_commitargs_sz (NFS3_fh_sz+3) -#define NFS3_attrstat_sz (1+NFS3_fattr_sz) -#define NFS3_wccstat_sz (1+NFS3_wcc_data_sz) -#define NFS3_removeres_sz (NFS3_wccstat_sz) +#define NFS3_getattrres_sz (1+NFS3_fattr_sz) +#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz) +#define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) #define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) @@ -100,1079 +100,2362 @@ static const umode_t nfs_type2fmt[] = { [NF3FIFO] = S_IFIFO, }; +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { - dprintk("nfs: %s: prematurely hit end of receive buffer. " + dprintk("NFS: %s prematurely hit the end of our receive buffer. " "Remaining buffer length is %tu words.\n", func, xdr->end - xdr->p); } + /* - * Common NFS XDR functions as inlines + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. 
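 *
 * Editor's aside (not part of the patch): every fixed-size decode in
 * this file follows the same bounds-checked pattern around
 * xdr_inline_decode(). The sketch below models it in userspace with a
 * toy stream type; the names and the -EIO stand-in are assumptions,
 * not the kernel's actual xdr_stream machinery.
 */

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_stream {
	const unsigned char *p;		/* read cursor */
	const unsigned char *end;	/* one past the last valid byte */
};

/* Claim nbytes from the stream, or NULL if the buffer is too short. */
static const unsigned char *inline_decode(struct toy_stream *s, size_t nbytes)
{
	const unsigned char *p = s->p;

	if ((size_t)(s->end - p) < nbytes)
		return NULL;
	s->p += nbytes;
	return p;
}

static int decode_uint32_model(struct toy_stream *s, uint32_t *value)
{
	const unsigned char *p = inline_decode(s, 4);
	uint32_t raw;

	if (p == NULL)
		return -5;	/* stands in for -EIO after the overflow msg */
	memcpy(&raw, p, 4);
	*value = ntohl(raw);
	return 0;
}

int main(void)
{
	const unsigned char wire[] = { 0, 0, 0, 42 };
	struct toy_stream s = { wire, wire + sizeof(wire) };
	uint32_t v;

	if (decode_uint32_model(&s, &v) == 0)
		printf("decoded %u\n", v);
	if (decode_uint32_model(&s, &v) != 0)
		printf("second read is refused, buffer exhausted\n");
	return 0;
}

/* End of editor's aside; the patch text resumes below.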
*/ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh) + +static void encode_uint32(struct xdr_stream *xdr, u32 value) { - return xdr_encode_array(p, fh->data, fh->size); + __be32 *p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); } -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) +static int decode_uint32(struct xdr_stream *xdr, u32 *value) { - if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { - memcpy(fh->data, p, fh->size); - return p + XDR_QUADLEN(fh->size); - } - return NULL; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *value = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_uint64(struct xdr_stream *xdr, u64 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_hyper(p, value); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * fileid3 + * + * typedef uint64 fileid3; + */ +static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid) +{ + return xdr_decode_hyper(p, fileid); +} + +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ + return decode_uint64(xdr, fileid); +} + +/* + * filename3 + * + * typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS3_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); } -static inline __be32 * -xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh) +static int decode_inline_filename3(struct xdr_stream *xdr, + const char **name, u32 *length) { __be32 *p; + u32 count; + p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) goto out_overflow; - fh->size = ntohl(*p++); + *name = (const char *)p; + *length = count; + return 0; - if (fh->size <= NFS3_FHSIZE) { - p = xdr_inline_decode(xdr, fh->size); - if (unlikely(!p)) - goto out_overflow; - memcpy(fh->data, p, fh->size); - return p + XDR_QUADLEN(fh->size); - } - return NULL; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * nfspath3 + * + * typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, + const u32 length) +{ + BUG_ON(length > NFS3_MAXPATHLEN); + encode_uint32(xdr, length); + xdr_write_pages(xdr, pages, 0, length); +} +static int decode_nfspath3(struct xdr_stream *xdr) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) + goto out_nametoolong; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, count); + xdr_terminate_string(xdr->buf, count); + return 0; + +out_nametoolong: + dprintk("NFS: returned pathname too long: %u\n", count); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "count %u > recvd %u\n", count, recvd); + return -EIO; out_overflow: 
print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return -EIO; } /* - * Encode/decode time. + * cookie3 + * + * typedef uint64 cookie3 */ -static inline __be32 * -xdr_encode_time3(__be32 *p, struct timespec *timep) +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) { - *p++ = htonl(timep->tv_sec); - *p++ = htonl(timep->tv_nsec); - return p; + return xdr_encode_hyper(p, cookie); } -static inline __be32 * -xdr_decode_time3(__be32 *p, struct timespec *timep) +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie) { - timep->tv_sec = ntohl(*p++); - timep->tv_nsec = ntohl(*p++); - return p; + return decode_uint64(xdr, cookie); +} + +/* + * cookieverf3 + * + * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; + */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ + memcpy(p, verifier, NFS3_COOKIEVERFSIZE); + return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_COOKIEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * createverf3 + * + * typedef opaque createverf3[NFS3_CREATEVERFSIZE]; + */ +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); + memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_WRITEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * size3 + * + * typedef uint64 size3; + */ +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ + return xdr_decode_hyper(p, size); +} + +/* + * nfsstat3 + * + * enum nfsstat3 { + * NFS3_OK = 0, + * ... 
+ * } + */ +#define NFS3_OK NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * ftype3 + * + * enum ftype3 { + * NF3REG = 1, + * NF3DIR = 2, + * NF3BLK = 3, + * NF3CHR = 4, + * NF3LNK = 5, + * NF3SOCK = 6, + * NF3FIFO = 7 + * }; + */ +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) +{ + BUG_ON(type > NF3FIFO); + encode_uint32(xdr, type); } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) +static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode) { - unsigned int type, major, minor; - umode_t fmode; + u32 type; - type = ntohl(*p++); + type = be32_to_cpup(p++); if (type > NF3FIFO) type = NF3NON; - fmode = nfs_type2fmt[type]; - fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; - fattr->nlink = ntohl(*p++); - fattr->uid = ntohl(*p++); - fattr->gid = ntohl(*p++); - p = xdr_decode_hyper(p, &fattr->size); - p = xdr_decode_hyper(p, &fattr->du.nfs3.used); - - /* Turn remote device info into Linux-specific dev_t */ - major = ntohl(*p++); - minor = ntohl(*p++); - fattr->rdev = MKDEV(major, minor); - if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) - fattr->rdev = 0; + *mode = nfs_type2fmt[type]; + return p; +} - p = xdr_decode_hyper(p, &fattr->fsid.major); - fattr->fsid.minor = 0; - p = xdr_decode_hyper(p, &fattr->fileid); - p = xdr_decode_time3(p, &fattr->atime); - p = xdr_decode_time3(p, &fattr->mtime); - p = xdr_decode_time3(p, &fattr->ctime); +/* + * specdata3 + * + * struct specdata3 { + * uint32 specdata1; + * uint32 specdata2; + * }; + */ +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) +{ + __be32 *p; - /* Update the mode bits */ - fattr->valid |= NFS_ATTR_FATTR_V3; + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(MAJOR(rdev)); + *p = cpu_to_be32(MINOR(rdev)); +} + +static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev) +{ + unsigned int major, minor; + + major = be32_to_cpup(p++); + minor = be32_to_cpup(p++); + *rdev = MKDEV(major, minor); + if (MAJOR(*rdev) != major || MINOR(*rdev) != minor) + *rdev = 0; + return p; +} + +/* + * nfs_fh3 + * + * struct nfs_fh3 { + * opaque data<NFS3_FHSIZE>; + * }; + */ +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size > NFS3_FHSIZE); + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); +} + +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > NFS3_FHSIZE)) + goto out_toobig; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = length; + memcpy(fh->data, p, length); + return 0; +out_toobig: + dprintk("NFS: file handle size (%u) too big\n", length); + return -E2BIG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(*fh)); +} + +/* + * nfstime3 + * + * struct nfstime3 { + * uint32 seconds; + * uint32 nseconds; + * }; + */ +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(timep->tv_nsec); return p; } -static inline __be32 * 
-xdr_encode_sattr(__be32 *p, struct iattr *attr) +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) { + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++); + return p; +} + +/* + * sattr3 + * + * enum time_how { + * DONT_CHANGE = 0, + * SET_TO_SERVER_TIME = 1, + * SET_TO_CLIENT_TIME = 2 + * }; + * + * union set_mode3 switch (bool set_it) { + * case TRUE: + * mode3 mode; + * default: + * void; + * }; + * + * union set_uid3 switch (bool set_it) { + * case TRUE: + * uid3 uid; + * default: + * void; + * }; + * + * union set_gid3 switch (bool set_it) { + * case TRUE: + * gid3 gid; + * default: + * void; + * }; + * + * union set_size3 switch (bool set_it) { + * case TRUE: + * size3 size; + * default: + * void; + * }; + * + * union set_atime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 atime; + * default: + * void; + * }; + * + * union set_mtime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 mtime; + * default: + * void; + * }; + * + * struct sattr3 { + * set_mode3 mode; + * set_uid3 uid; + * set_gid3 gid; + * set_size3 size; + * set_atime atime; + * set_mtime mtime; + * }; + */ +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +{ + u32 nbytes; + __be32 *p; + + /* + * In order to make only a single xdr_reserve_space() call, + * pre-compute the total number of bytes to be reserved. + * Six boolean values, one for each set_foo field, are always + * present in the encoded result, so start there. + */ + nbytes = 6 * 4; + if (attr->ia_valid & ATTR_MODE) + nbytes += 4; + if (attr->ia_valid & ATTR_UID) + nbytes += 4; + if (attr->ia_valid & ATTR_GID) + nbytes += 4; + if (attr->ia_valid & ATTR_SIZE) + nbytes += 8; + if (attr->ia_valid & ATTR_ATIME_SET) + nbytes += 8; + if (attr->ia_valid & ATTR_MTIME_SET) + nbytes += 8; + p = xdr_reserve_space(xdr, nbytes); + if (attr->ia_valid & ATTR_MODE) { *p++ = xdr_one; - *p++ = htonl(attr->ia_mode & S_IALLUGO); - } else { + *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_UID) { *p++ = xdr_one; - *p++ = htonl(attr->ia_uid); - } else { + *p++ = cpu_to_be32(attr->ia_uid); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_GID) { *p++ = xdr_one; - *p++ = htonl(attr->ia_gid); - } else { + *p++ = cpu_to_be32(attr->ia_gid); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_SIZE) { *p++ = xdr_one; - p = xdr_encode_hyper(p, (__u64) attr->ia_size); - } else { + p = xdr_encode_hyper(p, (u64)attr->ia_size); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_ATIME_SET) { *p++ = xdr_two; - p = xdr_encode_time3(p, &attr->ia_atime); + p = xdr_encode_nfstime3(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { *p++ = xdr_one; - } else { + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_MTIME_SET) { *p++ = xdr_two; - p = xdr_encode_time3(p, &attr->ia_mtime); + xdr_encode_nfstime3(p, &attr->ia_mtime); } else if (attr->ia_valid & ATTR_MTIME) { - *p++ = xdr_one; - } else { - *p++ = xdr_zero; - } - return p; + *p = xdr_one; + } else + *p = xdr_zero; +} + +/* + * fattr3 + * + * struct fattr3 { + * ftype3 type; + * mode3 mode; + * uint32 nlink; + * uid3 uid; + * gid3 gid; + * size3 size; + * size3 used; + * specdata3 rdev; + * uint64 fsid; + * fileid3 fileid; + * nfstime3 atime; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + umode_t fmode; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_fattr_sz 
<< 2); + if (unlikely(p == NULL)) + goto out_overflow; + + p = xdr_decode_ftype3(p, &fmode); + + fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + + p = xdr_decode_size3(p, &fattr->size); + p = xdr_decode_size3(p, &fattr->du.nfs3.used); + p = xdr_decode_specdata3(p, &fattr->rdev); + + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; + + p = xdr_decode_fileid3(p, &fattr->fileid); + p = xdr_decode_nfstime3(p, &fattr->atime); + p = xdr_decode_nfstime3(p, &fattr->mtime); + xdr_decode_nfstime3(p, &fattr->ctime); + + fattr->valid |= NFS_ATTR_FATTR_V3; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32 * -xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) +/* + * post_op_attr + * + * union post_op_attr switch (bool attributes_follow) { + * case TRUE: + * fattr3 attributes; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { - p = xdr_decode_hyper(p, &fattr->pre_size); - p = xdr_decode_time3(p, &fattr->pre_mtime); - p = xdr_decode_time3(p, &fattr->pre_ctime); + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_fattr3(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * wcc_attr + * struct wcc_attr { + * size3 size; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_PRECTIME; - return p; -} -static inline __be32 * -xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) -{ - if (*p++) - p = xdr_decode_fattr(p, fattr); - return p; + p = xdr_decode_size3(p, &fattr->pre_size); + p = xdr_decode_nfstime3(p, &fattr->pre_mtime); + xdr_decode_nfstime3(p, &fattr->pre_ctime); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32 * -xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr) +/* + * pre_op_attr + * union pre_op_attr switch (bool attributes_follow) { + * case TRUE: + * wcc_attr attributes; + * case FALSE: + * void; + * }; + * + * wcc_data + * + * struct wcc_data { + * pre_op_attr before; + * post_op_attr after; + * }; + */ +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) goto out_overflow; - if (ntohl(*p++)) { - p = xdr_inline_decode(xdr, 84); - if (unlikely(!p)) - goto out_overflow; - p = xdr_decode_fattr(p, fattr); - } - return p; + if (*p != xdr_zero) + return decode_wcc_attr(xdr, fattr); + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return -EIO; } -static inline __be32 * -xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) { - if (*p++) - return xdr_decode_wcc_attr(p, fattr); - return p; + int error; + + error = decode_pre_op_attr(xdr, fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, fattr); +out: + return error; } +/* + * post_op_fh3 + * + * union post_op_fh3 switch (bool 
handle_follows) { + * case TRUE: + * nfs_fh3 handle; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_nfs_fh3(xdr, fh); + zero_nfs_fh3(fh); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} -static inline __be32 * -xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr) +/* + * diropargs3 + * + * struct diropargs3 { + * nfs_fh3 dir; + * filename3 name; + * }; + */ +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) { - p = xdr_decode_pre_op_attr(p, fattr); - return xdr_decode_post_op_attr(p, fattr); + encode_nfs_fh3(xdr, fh); + encode_filename3(xdr, name, length); } + /* - * NFS encode functions + * NFSv3 XDR encode functions + * + * NFSv3 argument types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". */ /* - * Encode file handle argument + * 3.3.1 GETATTR3args + * + * struct GETATTR3args { + * nfs_fh3 object; + * }; */ -static int -nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) { - p = xdr_encode_fhandle(p, fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_nfs_fh3(xdr, fh); } /* - * Encode SETATTR arguments + * 3.3.2 SETATTR3args + * + * union sattrguard3 switch (bool check) { + * case TRUE: + * nfstime3 obj_ctime; + * case FALSE: + * void; + * }; + * + * struct SETATTR3args { + * nfs_fh3 object; + * sattr3 new_attributes; + * sattrguard3 guard; + * }; */ -static int -nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) -{ - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_sattr(p, args->sattr); - *p++ = htonl(args->guard); - if (args->guard) - p = xdr_encode_time3(p, &args->guardtime); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void encode_sattrguard3(struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + __be32 *p; + + if (args->guard) { + p = xdr_reserve_space(xdr, 4 + 8); + *p++ = xdr_one; + xdr_encode_nfstime3(p, &args->guardtime); + } else { + p = xdr_reserve_space(xdr, 4); + *p = xdr_zero; + } +} + +static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_sattr3(xdr, args->sattr); + encode_sattrguard3(xdr, args); } /* - * Encode directory ops argument + * 3.3.3 LOOKUP3args + * + * struct LOOKUP3args { + * diropargs3 what; + * }; */ -static int -nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_diropargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); } /* - * Encode REMOVE argument + * 3.3.4 ACCESS3args + * + * struct ACCESS3args { + * nfs_fh3 object; + * uint32 access; + * }; */ -static int -nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void encode_access3args(struct xdr_stream *xdr, + const struct nfs3_accessargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = 
xdr_encode_array(p, args->name.name, args->name.len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->access); +} + +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_access3args(xdr, args); } /* - * Encode access() argument + * 3.3.5 READLINK3args + * + * struct READLINK3args { + * nfs_fh3 symlink; + * }; */ -static int -nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args) +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readlinkargs *args) { - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->access); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_nfs_fh3(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); } /* - * Arguments to a READ call. Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. + * 3.3.6 READ3args + * + * struct READ3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; */ -static int -nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void encode_read3args(struct xdr_stream *xdr, + const struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 count = args->count; + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); - p = xdr_encode_fhandle(p, args->fh); + p = xdr_reserve_space(xdr, 8 + 4); p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + *p = cpu_to_be32(args->count); +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, - args->pages, args->pgbase, count); +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_read3args(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS3_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; - return 0; } /* - * Write arguments. Splice the buffer to be written into the iovec. 
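 *
 * (Editor's note, not part of the patch: prepare_reply_buffer(),
 * presumably introduced earlier in this patch, is what the READ and
 * READLINK encoders above call in place of the open-coded reply
 * setup being removed here.  A sketch of the expected shape, sizes
 * in XDR words, mirroring the arithmetic the old encoders did by
 * hand:
 *
 *	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
 *	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 *			 pages, base, len);
 *
 * so that bulk reply data -- READ payload, symlink text, directory
 * pages -- lands directly in the caller's pages instead of being
 * copied out of the receive buffer.)
 *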
+ * 3.3.7 WRITE3args + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * stable_how stable; + * opaque data<>; + * }; */ -static int -nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_write3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - u32 count = args->count; + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); - p = xdr_encode_fhandle(p, args->fh); + p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4); p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(count); - *p++ = htonl(args->stable); - *p++ = htonl(count); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); - - /* Copy the page array */ - xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); - sndbuf->flags |= XDRBUF_WRITE; - return 0; + *p++ = cpu_to_be32(args->count); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_write3args(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* - * Encode CREATE arguments + * 3.3.8 CREATE3args + * + * enum createmode3 { + * UNCHECKED = 0, + * GUARDED = 1, + * EXCLUSIVE = 2 + * }; + * + * union createhow3 switch (createmode3 mode) { + * case UNCHECKED: + * case GUARDED: + * sattr3 obj_attributes; + * case EXCLUSIVE: + * createverf3 verf; + * }; + * + * struct CREATE3args { + * diropargs3 where; + * createhow3 how; + * }; */ -static int -nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args) +static void encode_createhow3(struct xdr_stream *xdr, + const struct nfs3_createargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - - *p++ = htonl(args->createmode); - if (args->createmode == NFS3_CREATE_EXCLUSIVE) { - *p++ = args->verifier[0]; - *p++ = args->verifier[1]; - } else - p = xdr_encode_sattr(p, args->sattr); + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + encode_sattr3(xdr, args->sattr); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); + break; + default: + BUG(); + } +} - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_createhow3(xdr, args); } /* - * Encode MKDIR arguments + * 3.3.9 MKDIR3args + * + * struct MKDIR3args { + * diropargs3 where; + * sattr3 attributes; + * }; */ -static int -nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args) +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mkdirargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_sattr3(xdr, args->sattr); } /* - * Encode SYMLINK arguments + * 3.3.10 SYMLINK3args + * + * struct symlinkdata3 { + * sattr3 symlink_attributes; + * nfspath3 symlink_data; + * }; + * + * struct SYMLINK3args { + 
* diropargs3 where; + * symlinkdata3 symlink; + * }; */ -static int -nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args) +static void encode_symlinkdata3(struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_sattr(p, args->sattr); - *p++ = htonl(args->pathlen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + encode_sattr3(xdr, args->sattr); + encode_nfspath3(xdr, args->pages, args->pathlen); +} - /* Copy the page */ - xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); - return 0; +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(xdr, args); } /* - * Encode MKNOD arguments + * 3.3.11 MKNOD3args + * + * struct devicedata3 { + * sattr3 dev_attributes; + * specdata3 spec; + * }; + * + * union mknoddata3 switch (ftype3 type) { + * case NF3CHR: + * case NF3BLK: + * devicedata3 device; + * case NF3SOCK: + * case NF3FIFO: + * sattr3 pipe_attributes; + * default: + * void; + * }; + * + * struct MKNOD3args { + * diropargs3 where; + * mknoddata3 what; + * }; */ -static int -nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) -{ - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - *p++ = htonl(args->type); - p = xdr_encode_sattr(p, args->sattr); - if (args->type == NF3CHR || args->type == NF3BLK) { - *p++ = htonl(MAJOR(args->rdev)); - *p++ = htonl(MINOR(args->rdev)); +static void encode_devicedata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_specdata3(xdr, args->rdev); +} + +static void encode_mknoddata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: + encode_devicedata3(xdr, args); + break; + case NF3SOCK: + case NF3FIFO: + encode_sattr3(xdr, args->sattr); + break; + case NF3REG: + case NF3DIR: + break; + default: + BUG(); } +} - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_mknoddata3(xdr, args); } /* - * Encode RENAME arguments + * 3.3.12 REMOVE3args + * + * struct REMOVE3args { + * diropargs3 object; + * }; */ -static int -nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) -{ - p = xdr_encode_fhandle(p, args->old_dir); - p = xdr_encode_array(p, args->old_name->name, args->old_name->len); - p = xdr_encode_fhandle(p, args->new_dir); - p = xdr_encode_array(p, args->new_name->name, args->new_name->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name.name, args->name.len); } /* - * Encode LINK arguments + * 3.3.14 RENAME3args + * + * struct RENAME3args { + * diropargs3 from; + * diropargs3 to; + * }; */ -static int -nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) +static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct 
nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs3(xdr, args->old_dir, old->name, old->len); + encode_diropargs3(xdr, args->new_dir, new->name, new->len); } /* - * Encode arguments to readdir call + * 3.3.15 LINK3args + * + * struct LINK3args { + * nfs_fh3 file; + * diropargs3 link; + * }; */ -static int -nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_linkargs *args) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 count = args->count; - - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->cookie); - *p++ = args->verf[0]; - *p++ = args->verf[1]; - if (args->plus) { - /* readdirplus: need dircount + buffer size. - * We just make sure we make dircount big enough */ - *p++ = htonl(count >> 3); - } - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); - return 0; + encode_nfs_fh3(xdr, args->fromfh); + encode_diropargs3(xdr, args->tofh, args->toname, args->tolen); } /* - * Decode the result of a readdir call. - * We just check for syntactical correctness. + * 3.3.16 READDIR3args + * + * struct READDIR3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 count; + * }; */ -static int -nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) +static void encode_readdir3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - struct page **page; - size_t hdrlen; - u32 recvd, pglen; - int status; - - status = ntohl(*p++); - /* Decode post_op_attrs */ - p = xdr_decode_post_op_attr(p, res->dir_attr); - if (status) - return nfs_stat_to_errno(status); - /* Decode verifier cookie */ - if (res->verf) { - res->verf[0] = *p++; - res->verf[1] = *p++; - } else { - p += 2; - } + __be32 *p; - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READDIR reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READDIR header is short. 
iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } + encode_nfs_fh3(xdr, args->fh); - pglen = rcvbuf->page_len; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - page = rcvbuf->pages; + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + *p = cpu_to_be32(args->count); +} - return pglen; +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdir3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); } -__be32 * -nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) +/* + * 3.3.17 READDIRPLUS3args + * + * struct READDIRPLUS3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 dircount; + * count3 maxcount; + * }; + */ +static void encode_readdirplus3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) { __be32 *p; - struct nfs_entry old = *entry; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (!ntohl(*p++)) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); - entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); - } - p = xdr_inline_decode(xdr, 12); - if (unlikely(!p)) - goto out_overflow; - p = xdr_decode_hyper(p, &entry->ino); - entry->len = ntohl(*p++); + encode_nfs_fh3(xdr, args->fh); - p = xdr_inline_decode(xdr, entry->len + 8); - if (unlikely(!p)) - goto out_overflow; - entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); - - entry->d_type = DT_UNKNOWN; - if (plus) { - entry->fattr->valid = 0; - p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); - if (IS_ERR(p)) - goto out_overflow_exit; - entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - /* In fact, a post_op_fh3: */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (*p++) { - p = xdr_decode_fhandle_stream(xdr, entry->fh); - if (IS_ERR(p)) - goto out_overflow_exit; - /* Ugh -- server reply was truncated */ - if (p == NULL) { - dprintk("NFS: FH truncated\n"); - *entry = old; - return ERR_PTR(-EAGAIN); - } - } else - memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); - } + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; + /* + * readdirplus: need dircount + buffer size. 
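+ * (Editor's note: RFC 1813 defines dircount as a limit on just the
+ * directory-entry data -- fileid, name, cookie -- while maxcount
+ * bounds the whole READDIRPLUS3resok reply, so deriving dircount as
+ * count >> 3 is a client-side heuristic, not a protocol rule.)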
+ * We just make sure we make dircount big enough + */ + *p++ = cpu_to_be32(args->count >> 3); - return p; + *p = cpu_to_be32(args->count); +} -out_overflow: - print_overflow_msg(__func__, xdr); -out_overflow_exit: - return ERR_PTR(-EAGAIN); +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdirplus3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); } /* - * Encode COMMIT arguments + * 3.3.21 COMMIT3args + * + * struct COMMIT3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; */ -static int -nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_commit3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - p = xdr_encode_fhandle(p, args->fh); + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(args->count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + *p = cpu_to_be32(args->count); } -#ifdef CONFIG_NFS_V3_ACL -/* - * Encode GETACL arguments - */ -static int -nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, - struct nfs3_getaclargs *args) +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; + encode_commit3args(xdr, args); +} - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->mask); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +#ifdef CONFIG_NFS_V3_ACL - if (args->mask & (NFS_ACL | NFS_DFACL)) { - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + - ACL3_getaclres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, - NFSACL_MAXPAGES << PAGE_SHIFT); - } - return 0; +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_getaclargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->mask); + if (args->mask & (NFS_ACL | NFS_DFACL)) + prepare_reply_buffer(req, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT, + ACL3_getaclres_sz); } -/* - * Encode SETACL arguments - */ -static int -nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, - struct nfs3_setaclargs *args) +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_setaclargs *args) { - struct xdr_buf *buf = &req->rq_snd_buf; unsigned int base; - int err; - - p = xdr_encode_fhandle(p, NFS_FH(args->inode)); - *p++ = htonl(args->mask); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - base = req->rq_slen; + int error; + encode_nfs_fh3(xdr, NFS_FH(args->inode)); + encode_uint32(xdr, args->mask); if (args->npages != 0) - xdr_encode_pages(buf, args->pages, 0, args->len); - else - req->rq_slen = xdr_adjust_iovec(req->rq_svec, - p + XDR_QUADLEN(args->len)); + xdr_write_pages(xdr, args->pages, 0, args->len); - err = nfsacl_encode(buf, base, args->inode, + base = req->rq_slen; + error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? args->acl_access : NULL, 1, 0); - if (err > 0) - err = nfsacl_encode(buf, base + err, args->inode, - (args->mask & NFS_DFACL) ? - args->acl_default : NULL, 1, - NFS_ACL_DEFAULT); - return (err > 0) ? 0 : err; + BUG_ON(error < 0); + error = nfsacl_encode(xdr->buf, base + error, args->inode, + (args->mask & NFS_DFACL) ? 
+ args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + BUG_ON(error < 0); } + #endif /* CONFIG_NFS_V3_ACL */ /* - * NFS XDR decode functions + * NFSv3 XDR decode functions + * + * NFSv3 result types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". */ /* - * Decode attrstat reply. + * 3.3.1 GETATTR3res + * + * struct GETATTR3resok { + * fattr3 obj_attributes; + * }; + * + * union GETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * GETATTR3resok resok; + * default: + * void; + * }; */ -static int -nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) { - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - xdr_decode_fattr(p, fattr); - return 0; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_fattr3(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Decode status+wcc_data reply - * SATTR, REMOVE, RMDIR + * 3.3.2 SETATTR3res + * + * struct SETATTR3resok { + * wcc_data obj_wcc; + * }; + * + * struct SETATTR3resfail { + * wcc_data obj_wcc; + * }; + * + * union SETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * SETATTR3resok resok; + * default: + * SETATTR3resfail resfail; + * }; */ -static int -nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) { - int status; - - if ((status = ntohl(*p++))) - status = nfs_stat_to_errno(status); - xdr_decode_wcc_data(p, fattr); - return status; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); } -static int -nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) +/* + * 3.3.3 LOOKUP3res + * + * struct LOOKUP3resok { + * nfs_fh3 object; + * post_op_attr obj_attributes; + * post_op_attr dir_attributes; + * }; + * + * struct LOOKUP3resfail { + * post_op_attr dir_attributes; + * }; + * + * union LOOKUP3res switch (nfsstat3 status) { + * case NFS3_OK: + * LOOKUP3resok resok; + * default: + * LOOKUP3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) { - return nfs3_xdr_wccstat(req, p, res->dir_attr); + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfs_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->dir_attr); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); } /* - * Decode LOOKUP reply + * 3.3.4 ACCESS3res + * + * struct ACCESS3resok { + * post_op_attr obj_attributes; + * uint32 access; + * }; + * + * struct ACCESS3resfail { + * post_op_attr obj_attributes; + * }; + * + * union ACCESS3res switch (nfsstat3 status) 
{ + * case NFS3_OK: + * ACCESS3resok resok; + * default: + * ACCESS3resfail resfail; + * }; */ -static int -nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_accessres *result) { - int status; - - if ((status = ntohl(*p++))) { - status = nfs_stat_to_errno(status); - } else { - if (!(p = xdr_decode_fhandle(p, res->fh))) - return -errno_NFSERR_IO; - p = xdr_decode_post_op_attr(p, res->fattr); - } - xdr_decode_post_op_attr(p, res->dir_attr); - return status; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_uint32(xdr, &result->access); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Decode ACCESS reply + * 3.3.5 READLINK3res + * + * struct READLINK3resok { + * post_op_attr symlink_attributes; + * nfspath3 data; + * }; + * + * struct READLINK3resfail { + * post_op_attr symlink_attributes; + * }; + * + * union READLINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * READLINK3resok resok; + * default: + * READLINK3resfail resfail; + * }; */ -static int -nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) +static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) { - int status = ntohl(*p++); - - p = xdr_decode_post_op_attr(p, res->fattr); - if (status) - return nfs_stat_to_errno(status); - res->access = ntohl(*p++); - return 0; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfspath3(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } -static int -nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) +/* + * 3.3.6 READ3res + * + * struct READ3resok { + * post_op_attr file_attributes; + * count3 count; + * bool eof; + * opaque data<>; + * }; + * + * struct READ3resfail { + * post_op_attr file_attributes; + * }; + * + * union READ3res switch (nfsstat3 status) { + * case NFS3_OK: + * READ3resok resok; + * default: + * READ3resfail resfail; + * }; + */ +static int decode_read3resok(struct xdr_stream *xdr, + struct nfs_readres *result) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; + u32 eof, count, ocount, recvd; + size_t hdrlen; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p++); + eof = be32_to_cpup(p++); + ocount = be32_to_cpup(p++); + if (unlikely(ocount != count)) + goto out_mismatch; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + +out: + xdr_read_pages(xdr, count); + result->eof = eof; + result->count = count; + return count; +out_mismatch: + dprintk("NFS: READ count doesn't match length of opaque: " + "count %u != ocount %u\n", count, ocount); + return -EIO; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; 
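+ /*
+ * (Editor's note, not part of the patch: this clamp makes a short
+ * or over-stated READ reply non-fatal -- count is limited to the
+ * bytes actually received and eof is cleared, so the caller sees a
+ * short read and issues another READ for the remainder instead of
+ * trusting the server's inflated count.)
+ */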
+ goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); - return 0; +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_read3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode READLINK reply + * 3.3.7 WRITE3res + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3resok { + * wcc_data file_wcc; + * count3 count; + * stable_how committed; + * writeverf3 verf; + * }; + * + * struct WRITE3resfail { + * wcc_data file_wcc; + * }; + * + * union WRITE3res switch (nfsstat3 status) { + * case NFS3_OK: + * WRITE3resok resok; + * default: + * WRITE3resfail resfail; + * }; */ -static int -nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int decode_write3resok(struct xdr_stream *xdr, + struct nfs_writeres *result) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 len, recvd; - int status; - - status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, fattr); - - if (status != 0) - return nfs_stat_to_errno(status); - - /* Convert length of symlink */ - len = ntohl(*p++); - if (len >= rcvbuf->page_len) { - dprintk("nfs: server returned giant symlink!\n"); - return -ENAMETOOLONG; - } + __be32 *p; - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READLINK reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READLINK header is short. 
" - "iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } - recvd = req->rq_rcv_buf.len - hdrlen; - if (recvd < len) { - dprintk("NFS: server cheating in readlink reply: " - "count %u > recvd %u\n", len, recvd); - return -EIO; - } + p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + result->count = be32_to_cpup(p++); + result->verf->committed = be32_to_cpup(p++); + if (unlikely(result->verf->committed > NFS_FILE_SYNC)) + goto out_badvalue; + memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE); + return result->count; +out_badvalue: + dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} - xdr_terminate_string(rcvbuf, len); - return 0; +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_write3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode READ reply + * 3.3.8 CREATE3res + * + * struct CREATE3resok { + * post_op_fh3 obj; + * post_op_attr obj_attributes; + * wcc_data dir_wcc; + * }; + * + * struct CREATE3resfail { + * wcc_data dir_wcc; + * }; + * + * union CREATE3res switch (nfsstat3 status) { + * case NFS3_OK: + * CREATE3resok resok; + * default: + * CREATE3resfail resfail; + * }; */ -static int -nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) +static int decode_create3resok(struct xdr_stream *xdr, + struct nfs3_diropres *result) { - struct kvec *iov = req->rq_rcv_buf.head; - size_t hdrlen; - u32 count, ocount, recvd; - int status; + int error; + + error = decode_post_op_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + /* The server isn't required to return a file handle. + * If it didn't, force the client to perform a LOOKUP + * to determine the correct file handle and attribute + * values for the new object. 
*/ + if (result->fh->size == 0) + result->fattr->valid = 0; + error = decode_wcc_data(xdr, result->dir_attr); +out: + return error; +} - status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, res->fattr); +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_create3resok(xdr, result); +out: + return error; +out_default: + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} - if (status != 0) - return nfs_stat_to_errno(status); +/* + * 3.3.12 REMOVE3res + * + * struct REMOVE3resok { + * wcc_data dir_wcc; + * }; + * + * struct REMOVE3resfail { + * wcc_data dir_wcc; + * }; + * + * union REMOVE3res switch (nfsstat3 status) { + * case NFS3_OK: + * REMOVE3resok resok; + * default: + * REMOVE3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_removeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} - /* Decode reply count and EOF flag. NFSv3 is somewhat redundant - * in that it puts the count both in the res struct and in the - * opaque data count. */ - count = ntohl(*p++); - res->eof = ntohl(*p++); - ocount = ntohl(*p++); +/* + * 3.3.14 RENAME3res + * + * struct RENAME3resok { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * struct RENAME3resfail { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * union RENAME3res switch (nfsstat3 status) { + * case NFS3_OK: + * RENAME3resok resok; + * default: + * RENAME3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_renameres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->old_fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->new_fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} - if (ocount != count) { - dprintk("NFS: READ count doesn't match RPC opaque count.\n"); - return -errno_NFSERR_IO; - } +/* + * 3.3.15 LINK3res + * + * struct LINK3resok { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * struct LINK3resfail { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * union LINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * LINK3resok resok; + * default: + * LINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs3_linkres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return 
nfs_stat_to_errno(status); +} - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READ reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READ header is short. iovec will be shifted.\n"); - xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); - } +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + * the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16 entry3 + * + * struct entry3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * fhandle3 filehandle; + * post_op_attr3 attributes; + * entry3 *nextentry; + * }; + * + * 3.3.17 entryplus3 + * struct entryplus3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * post_op_attr name_attributes; + * post_op_fh3 name_handle; + * entryplus3 *nextentry; + * }; + */ +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + struct nfs_entry old = *entry; + __be32 *p; + int error; - recvd = req->rq_rcv_buf.len - hdrlen; - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - count = recvd; - res->eof = 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; } - if (count < res->count) - res->count = count; + error = decode_fileid3(xdr, &entry->ino); + if (unlikely(error)) + return error; - return count; -} + error = decode_inline_filename3(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; -/* - * Decode WRITE response - */ -static int -nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) -{ - int status; + entry->prev_cookie = entry->cookie; + error = decode_cookie3(xdr, &entry->cookie); + if (unlikely(error)) + return error; - status = ntohl(*p++); - p = xdr_decode_wcc_data(p, res->fattr); + entry->d_type = DT_UNKNOWN; - if (status != 0) - return nfs_stat_to_errno(status); + if (plus) { + entry->fattr->valid = 0; + error = decode_post_op_attr(xdr, entry->fattr); + if (unlikely(error)) + return error; + if (entry->fattr->valid & NFS_ATTR_FATTR_V3) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - res->count = ntohl(*p++); - res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); - res->verf->verifier[0] = *p++; - res->verf->verifier[1] = *p++; + /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) { + error = decode_nfs_fh3(xdr, entry->fh); + if (unlikely(error)) { + if (error == -E2BIG) + goto out_truncated; + return error; + } + } else + zero_nfs_fh3(entry->fh); + } - return res->count; -} + return 0; -/* - * Decode a CREATE response - */ -static int -nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) -{ - int status; - - status = ntohl(*p++); - if (status == 0) { - if (*p++) { - if 
(!(p = xdr_decode_fhandle(p, res->fh))) - return -errno_NFSERR_IO; - p = xdr_decode_post_op_attr(p, res->fattr); - } else { - memset(res->fh, 0, sizeof(*res->fh)); - /* Do decode post_op_attr but set it to NULL */ - p = xdr_decode_post_op_attr(p, res->fattr); - res->fattr->valid = 0; - } - } else { - status = nfs_stat_to_errno(status); - } - p = xdr_decode_wcc_data(p, res->dir_attr); - return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +out_truncated: + dprintk("NFS: directory entry contains invalid file handle\n"); + *entry = old; + return -EAGAIN; } /* - * Decode RENAME reply + * 3.3.16 READDIR3res + * + * struct dirlist3 { + * entry3 *entries; + * bool eof; + * }; + * + * struct READDIR3resok { + * post_op_attr dir_attributes; + * cookieverf3 cookieverf; + * dirlist3 reply; + * }; + * + * struct READDIR3resfail { + * post_op_attr dir_attributes; + * }; + * + * union READDIR3res switch (nfsstat3 status) { + * case NFS3_OK: + * READDIR3resok resok; + * default: + * READDIR3resfail resfail; + * }; + * + * Read the directory contents into the page cache, but otherwise + * don't touch them. The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls. */ -static int -nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res) +static int decode_dirlist3(struct xdr_stream *xdr) { - int status; + u32 recvd, pglen; + size_t hdrlen; - if ((status = ntohl(*p++)) != 0) - status = nfs_stat_to_errno(status); - p = xdr_decode_wcc_data(p, res->old_fattr); - p = xdr_decode_wcc_data(p, res->new_fattr); - return status; + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; } -/* - * Decode LINK reply - */ -static int -nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) +static int decode_readdir3resok(struct xdr_stream *xdr, + struct nfs3_readdirres *result) { - int status; + int error; + + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + /* XXX: do we need to check if result->verf != NULL ? 
*/ + error = decode_cookieverf3(xdr, result->verf); + if (unlikely(error)) + goto out; + error = decode_dirlist3(xdr); +out: + return error; +} - if ((status = ntohl(*p++)) != 0) - status = nfs_stat_to_errno(status); - p = xdr_decode_post_op_attr(p, res->fattr); - p = xdr_decode_wcc_data(p, res->dir_attr); - return status; +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_readdir3resok(xdr, result); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); } /* - * Decode FSSTAT reply + * 3.3.18 FSSTAT3res + * + * struct FSSTAT3resok { + * post_op_attr obj_attributes; + * size3 tbytes; + * size3 fbytes; + * size3 abytes; + * size3 tfiles; + * size3 ffiles; + * size3 afiles; + * uint32 invarsec; + * }; + * + * struct FSSTAT3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSSTAT3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSSTAT3resok resok; + * default: + * FSSTAT3resfail resfail; + * }; */ -static int -nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) +static int decode_fsstat3resok(struct xdr_stream *xdr, + struct nfs_fsstat *result) { - int status; - - status = ntohl(*p++); - - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); - - p = xdr_decode_hyper(p, &res->tbytes); - p = xdr_decode_hyper(p, &res->fbytes); - p = xdr_decode_hyper(p, &res->abytes); - p = xdr_decode_hyper(p, &res->tfiles); - p = xdr_decode_hyper(p, &res->ffiles); - p = xdr_decode_hyper(p, &res->afiles); + __be32 *p; + p = xdr_inline_decode(xdr, 8 * 6 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p = xdr_decode_size3(p, &result->tbytes); + p = xdr_decode_size3(p, &result->fbytes); + p = xdr_decode_size3(p, &result->abytes); + p = xdr_decode_size3(p, &result->tfiles); + p = xdr_decode_size3(p, &result->ffiles); + xdr_decode_size3(p, &result->afiles); /* ignore invarsec */ return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsstat3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode FSINFO reply + * 3.3.19 FSINFO3res + * + * struct FSINFO3resok { + * post_op_attr obj_attributes; + * uint32 rtmax; + * uint32 rtpref; + * uint32 rtmult; + * uint32 wtmax; + * uint32 wtpref; + * uint32 wtmult; + * uint32 dtpref; + * size3 maxfilesize; + * nfstime3 time_delta; + * uint32 properties; + * }; + * + * struct FSINFO3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSINFO3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSINFO3resok resok; + * default: + * FSINFO3resfail resfail; + * }; */ -static int -nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) +static int decode_fsinfo3resok(struct xdr_stream *xdr, + struct nfs_fsinfo *result) { - int status; - - status = ntohl(*p++); - - p = 
xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); + __be32 *p; - res->rtmax = ntohl(*p++); - res->rtpref = ntohl(*p++); - res->rtmult = ntohl(*p++); - res->wtmax = ntohl(*p++); - res->wtpref = ntohl(*p++); - res->wtmult = ntohl(*p++); - res->dtpref = ntohl(*p++); - p = xdr_decode_hyper(p, &res->maxfilesize); - p = xdr_decode_time3(p, &res->time_delta); + p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->rtmax = be32_to_cpup(p++); + result->rtpref = be32_to_cpup(p++); + result->rtmult = be32_to_cpup(p++); + result->wtmax = be32_to_cpup(p++); + result->wtpref = be32_to_cpup(p++); + result->wtmult = be32_to_cpup(p++); + result->dtpref = be32_to_cpup(p++); + p = xdr_decode_size3(p, &result->maxfilesize); + xdr_decode_nfstime3(p, &result->time_delta); /* ignore properties */ - res->lease_time = 0; + result->lease_time = 0; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsinfo3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode PATHCONF reply + * 3.3.20 PATHCONF3res + * + * struct PATHCONF3resok { + * post_op_attr obj_attributes; + * uint32 linkmax; + * uint32 name_max; + * bool no_trunc; + * bool chown_restricted; + * bool case_insensitive; + * bool case_preserving; + * }; + * + * struct PATHCONF3resfail { + * post_op_attr obj_attributes; + * }; + * + * union PATHCONF3res switch (nfsstat3 status) { + * case NFS3_OK: + * PATHCONF3resok resok; + * default: + * PATHCONF3resfail resfail; + * }; */ -static int -nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) +static int decode_pathconf3resok(struct xdr_stream *xdr, + struct nfs_pathconf *result) { - int status; - - status = ntohl(*p++); - - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); - res->max_link = ntohl(*p++); - res->max_namelen = ntohl(*p++); + __be32 *p; + p = xdr_inline_decode(xdr, 4 * 6); + if (unlikely(p == NULL)) + goto out_overflow; + result->max_link = be32_to_cpup(p++); + result->max_namelen = be32_to_cpup(p); /* ignore remaining fields */ return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_pathconf3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode COMMIT reply + * 3.3.21 COMMIT3res + * + * struct COMMIT3resok { + * wcc_data file_wcc; + * writeverf3 verf; + * }; + * + * struct COMMIT3resfail { + * wcc_data file_wcc; + * }; + * + * union COMMIT3res switch (nfsstat3 status) { + * case NFS3_OK: + * COMMIT3resok resok; + * default: + * COMMIT3resfail resfail; + * }; */ -static int -nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, 
struct nfs_writeres *res) +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_writeres *result) { - int status; - - status = ntohl(*p++); - p = xdr_decode_wcc_data(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); - - res->verf->verifier[0] = *p++; - res->verf->verifier[1] = *p++; - return 0; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_writeverf3(xdr, result->verf->verifier); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } #ifdef CONFIG_NFS_V3_ACL -/* - * Decode GETACL reply - */ -static int -nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, - struct nfs3_getaclres *res) + +static inline int decode_getacl3resok(struct xdr_stream *xdr, + struct nfs3_getaclres *result) { - struct xdr_buf *buf = &req->rq_rcv_buf; - int status = ntohl(*p++); struct posix_acl **acl; unsigned int *aclcnt; - int err, base; - - if (status != 0) - return nfs_stat_to_errno(status); - p = xdr_decode_post_op_attr(p, res->fattr); - res->mask = ntohl(*p++); - if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) - return -EINVAL; - base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; - - acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; - aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; - err = nfsacl_decode(buf, base, aclcnt, acl); - - acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; - aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; - if (err > 0) - err = nfsacl_decode(buf, base + err, aclcnt, acl); - return (err > 0) ? 0 : err; + size_t hdrlen; + int error; + + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_uint32(xdr, &result->mask); + if (unlikely(error)) + goto out; + error = -EINVAL; + if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + goto out; + + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + + acl = NULL; + if (result->mask & NFS_ACL) + acl = &result->acl_access; + aclcnt = NULL; + if (result->mask & NFS_ACLCNT) + aclcnt = &result->acl_access_count; + error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); + if (unlikely(error <= 0)) + goto out; + + acl = NULL; + if (result->mask & NFS_DFACL) + acl = &result->acl_default; + aclcnt = NULL; + if (result->mask & NFS_DFACLCNT) + aclcnt = &result->acl_default_count; + error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); + if (unlikely(error <= 0)) + return error; + error = 0; +out: + return error; } -/* - * Decode setacl reply. 
- */ -static int -nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_getaclres *result) { - int status = ntohl(*p++); + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_getacl3resok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - if (status) - return nfs_stat_to_errno(status); - xdr_decode_post_op_attr(p, fattr); - return 0; +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_post_op_attr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } + #endif /* CONFIG_NFS_V3_ACL */ #define PROC(proc, argtype, restype, timer) \ [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ - .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ - .p_arglen = NFS3_##argtype##_sz, \ - .p_replen = NFS3_##restype##_sz, \ + .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \ + .p_arglen = NFS3_##argtype##args_sz, \ + .p_replen = NFS3_##restype##res_sz, \ .p_timer = timer, \ .p_statidx = NFS3PROC_##proc, \ .p_name = #proc, \ } struct rpc_procinfo nfs3_procedures[] = { - PROC(GETATTR, fhandle, attrstat, 1), - PROC(SETATTR, sattrargs, wccstat, 0), - PROC(LOOKUP, diropargs, lookupres, 2), - PROC(ACCESS, accessargs, accessres, 1), - PROC(READLINK, readlinkargs, readlinkres, 3), - PROC(READ, readargs, readres, 3), - PROC(WRITE, writeargs, writeres, 4), - PROC(CREATE, createargs, createres, 0), - PROC(MKDIR, mkdirargs, createres, 0), - PROC(SYMLINK, symlinkargs, createres, 0), - PROC(MKNOD, mknodargs, createres, 0), - PROC(REMOVE, removeargs, removeres, 0), - PROC(RMDIR, diropargs, wccstat, 0), - PROC(RENAME, renameargs, renameres, 0), - PROC(LINK, linkargs, linkres, 0), - PROC(READDIR, readdirargs, readdirres, 3), - PROC(READDIRPLUS, readdirargs, readdirres, 3), - PROC(FSSTAT, fhandle, fsstatres, 0), - PROC(FSINFO, fhandle, fsinfores, 0), - PROC(PATHCONF, fhandle, pathconfres, 0), - PROC(COMMIT, commitargs, commitres, 5), + PROC(GETATTR, getattr, getattr, 1), + PROC(SETATTR, setattr, setattr, 0), + PROC(LOOKUP, lookup, lookup, 2), + PROC(ACCESS, access, access, 1), + PROC(READLINK, readlink, readlink, 3), + PROC(READ, read, read, 3), + PROC(WRITE, write, write, 4), + PROC(CREATE, create, create, 0), + PROC(MKDIR, mkdir, create, 0), + PROC(SYMLINK, symlink, create, 0), + PROC(MKNOD, mknod, create, 0), + PROC(REMOVE, remove, remove, 0), + PROC(RMDIR, lookup, setattr, 0), + PROC(RENAME, rename, rename, 0), + PROC(LINK, link, link, 0), + PROC(READDIR, readdir, readdir, 3), + PROC(READDIRPLUS, readdirplus, readdir, 3), + PROC(FSSTAT, getattr, fsstat, 0), + PROC(FSINFO, getattr, fsinfo, 0), + PROC(PATHCONF, getattr, pathconf, 0), + PROC(COMMIT, commit, commit, 5), }; struct rpc_version nfs_version3 = { @@ -1185,8 +2468,8 @@ struct rpc_version nfs_version3 = { static struct rpc_procinfo nfs3_acl_procedures[] = { [ACLPROC3_GETACL] = { .p_proc = ACLPROC3_GETACL, - .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, - .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, + .p_encode 
= (kxdreproc_t)nfs3_xdr_enc_getacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res, .p_arglen = ACL3_getaclargs_sz, .p_replen = ACL3_getaclres_sz, .p_timer = 1, @@ -1194,8 +2477,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { }, [ACLPROC3_SETACL] = { .p_proc = ACLPROC3_SETACL, - .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, - .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res, .p_arglen = ACL3_setaclargs_sz, .p_replen = ACL3_setaclres_sz, .p_timer = 0, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9fa496387fd..7a747407314 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -44,6 +44,7 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_REBOOT, NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, + NFS4CLNT_LAYOUTRECALL, NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, }; @@ -109,7 +110,7 @@ struct nfs_unique_id { struct nfs4_state_owner { struct nfs_unique_id so_owner_id; struct nfs_server *so_server; - struct rb_node so_client_node; + struct rb_node so_server_node; struct rpc_cred *so_cred; /* Associated cred */ @@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops { extern const struct dentry_operations nfs4_dentry_operations; extern const struct inode_operations nfs4_dir_inode_operations; -/* inode.c */ -extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); -extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int); -extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); - - /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); @@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); +extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); extern void nfs4_release_lockowner(const struct nfs4_lock_state *); +extern const struct xattr_handler *nfs4_xattr_handlers[]; #if defined(CONFIG_NFS_V4_1) static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) @@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); extern struct rpc_procinfo nfs4_procedures[]; struct nfs4_mount_data; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 2e92f0d8d65..23f930caf1e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, { struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; - struct nfs_server *nfss = NFS_SERVER(lo->inode); + struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); dprintk("--> %s\n", __func__); @@ 
-101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, /* find and reference the deviceid */ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); if (dsaddr == NULL) { - dsaddr = get_device_info(lo->inode, id); + dsaddr = get_device_info(lo->plh_inode, id); if (dsaddr == NULL) goto out; } @@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, static void filelayout_free_lseg(struct pnfs_layout_segment *lseg) { - struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); + struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4435e5e1f90..9d992b0346e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -49,6 +49,7 @@ #include <linux/mount.h> #include <linux/module.h> #include <linux/sunrpc/bc_xprt.h> +#include <linux/xattr.h> #include "nfs4_fs.h" #include "delegation.h" @@ -355,9 +356,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) } /* - * Signal state manager thread if session is drained + * Signal state manager thread if session fore channel is drained */ -static void nfs41_check_drain_session_complete(struct nfs4_session *ses) +static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) { struct rpc_task *task; @@ -371,8 +372,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses) if (ses->fc_slot_table.highest_used_slotid != -1) return; - dprintk("%s COMPLETE: Session Drained\n", __func__); - complete(&ses->complete); + dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); + complete(&ses->fc_slot_table.complete); +} + +/* + * Signal state manager thread if session back channel is drained + */ +void nfs4_check_drain_bc_complete(struct nfs4_session *ses) +{ + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || + ses->bc_slot_table.highest_used_slotid != -1) + return; + dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); + complete(&ses->bc_slot_table.complete); } static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) @@ -389,7 +402,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slot); - nfs41_check_drain_session_complete(res->sr_session); + nfs4_check_drain_fc_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; } @@ -1826,6 +1839,8 @@ struct nfs4_closedata { struct nfs_closeres res; struct nfs_fattr fattr; unsigned long timestamp; + bool roc; + u32 roc_barrier; }; static void nfs4_free_closedata(void *data) @@ -1833,6 +1848,8 @@ static void nfs4_free_closedata(void *data) struct nfs4_closedata *calldata = data; struct nfs4_state_owner *sp = calldata->state->owner; + if (calldata->roc) + pnfs_roc_release(calldata->state->inode); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); @@ -1865,6 +1882,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) */ switch (task->tk_status) { case 0: + if (calldata->roc) + pnfs_roc_set_barrier(state->inode, + calldata->roc_barrier); nfs_set_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); nfs4_close_clear_stateid_flags(state, @@ -1917,8 +1937,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) return; } - if (calldata->arg.fmode == 0) + if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { + rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, + task, NULL); + return; + } + } nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; @@ -1946,7 +1973,7 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! */ -int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) +int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -1981,11 +2008,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; + calldata->roc = roc; path_get(path); calldata->path = *path; - msg.rpc_argp = &calldata->arg, - msg.rpc_resp = &calldata->res, + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -1998,6 +2026,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i out_free_calldata: kfree(calldata); out: + if (roc) + pnfs_roc_release(state->inode); nfs4_put_open_state(state); nfs4_put_state_owner(sp); return status; @@ -2486,6 +2516,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, path = &ctx->path; fmode = ctx->mode; } + sattr->ia_mode &= ~current_umask(); state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); d_drop(dentry); if (IS_ERR(state)) { @@ -2816,6 +2847,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, { struct nfs4_exception exception = { }; int err; + + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), _nfs4_proc_mkdir(dir, dentry, sattr), @@ -2916,6 +2949,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, { struct nfs4_exception exception = { }; int err; + + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), _nfs4_proc_mknod(dir, dentry, sattr, rdev), @@ -3478,6 +3513,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, struct nfs4_setclientid setclientid = { .sc_verifier = &sc_verifier, .sc_prog = program, + .sc_cb_ident = clp->cl_cb_ident, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], @@ -3517,7 +3553,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, if (signalled()) break; if (loop++ & 1) - ssleep(clp->cl_lease_time + 1); + ssleep(clp->cl_lease_time / HZ + 1); else if (++clp->cl_id_uniquifier == 0) break; @@ -3663,8 +3699,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co data->rpc_status = 0; task_setup_data.callback_data = data; - msg.rpc_argp = &data->args, - msg.rpc_resp = &data->res, + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -3743,6 +3779,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock goto out; lsp = request->fl_u.nfs4_fl.owner; arg.lock_owner.id = lsp->ls_id.id; + arg.lock_owner.s_dev = server->s_dev; status = nfs4_call_sync(server, &msg, &arg, &res, 1); switch (status) { case 0: @@ -3908,8 +3945,8 @@ static struct rpc_task 
*nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } - msg.rpc_argp = &data->arg, - msg.rpc_resp = &data->res, + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; task_setup_data.callback_data = data; return rpc_run_task(&task_setup_data); } @@ -3988,6 +4025,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; + p->arg.lock_owner.s_dev = server->s_dev; p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; @@ -4145,8 +4183,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->arg.reclaim = NFS_LOCK_RECLAIM; task_setup_data.callback_ops = &nfs4_recover_lock_ops; } - msg.rpc_argp = &data->arg, - msg.rpc_resp = &data->res, + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; task_setup_data.callback_data = data; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -4392,48 +4430,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) return; args->lock_owner.clientid = server->nfs_client->cl_clientid; args->lock_owner.id = lsp->ls_id.id; + args->lock_owner.s_dev = server->s_dev; msg.rpc_argp = args; rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); } #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" -int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, - size_t buflen, int flags) +static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) { - struct inode *inode = dentry->d_inode; - - if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) - return -EOPNOTSUPP; + if (strcmp(key, "") != 0) + return -EINVAL; - return nfs4_proc_set_acl(inode, buf, buflen); + return nfs4_proc_set_acl(dentry->d_inode, buf, buflen); } -/* The getxattr man page suggests returning -ENODATA for unknown attributes, - * and that's what we'll do for e.g. user attributes that haven't been set. - * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported - * attributes in kernel-managed attribute namespaces. */ -ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf, - size_t buflen) +static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) { - struct inode *inode = dentry->d_inode; - - if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) - return -EOPNOTSUPP; + if (strcmp(key, "") != 0) + return -EINVAL; - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_get_acl(dentry->d_inode, buf, buflen); } -ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) +static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) { - size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; + size_t len = sizeof(XATTR_NAME_NFSV4_ACL); if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) return 0; - if (buf && buflen < len) - return -ERANGE; - if (buf) - memcpy(buf, XATTR_NAME_NFSV4_ACL, len); + + if (list && len <= list_len) + memcpy(list, XATTR_NAME_NFSV4_ACL, len); return len; } @@ -4486,6 +4519,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, #ifdef CONFIG_NFS_V4_1 /* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set. 
+ */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ + if (flags & ~EXCHGID4_FLAG_MASK_R) + goto out_inval; + if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && + (flags & EXCHGID4_FLAG_USE_NON_PNFS)) + goto out_inval; + if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) + goto out_inval; + return NFS_OK; +out_inval: + return -NFS4ERR_INVAL; +} + +/* * nfs4_proc_exchange_id() * * Since the clientid has expired, all compounds using sessions @@ -4498,7 +4550,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) nfs4_verifier verifier; struct nfs41_exchange_id_args args = { .client = clp, - .flags = clp->cl_exchange_flags, + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, }; struct nfs41_exchange_id_res res = { .client = clp, @@ -4515,9 +4567,6 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) dprintk("--> %s\n", __func__); BUG_ON(clp == NULL); - /* Remove server-only flags */ - args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R; - p = (u32 *)verifier.data; *p++ = htonl((u32)clp->cl_boot_time.tv_sec); *p = htonl((u32)clp->cl_boot_time.tv_nsec); @@ -4543,6 +4592,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) break; } + status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); dprintk("<-- %s status= %d\n", __func__, status); return status; } @@ -4776,17 +4826,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) if (!session) return NULL; - init_completion(&session->complete); - tbl = &session->fc_slot_table; tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); tbl = &session->bc_slot_table; tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); session->session_state = 1<<NFS4_SESSION_INITING; @@ -5280,13 +5330,23 @@ static void nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct inode *ino = lgp->args.inode; - struct nfs_server *server = NFS_SERVER(ino); + struct nfs_server *server = NFS_SERVER(lgp->args.inode); dprintk("--> %s\n", __func__); + /* Note there is a race here, where a CB_LAYOUTRECALL can come in + * right now covering the LAYOUTGET we are about to send. + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. 
+ */ if (nfs4_setup_sequence(server, &lgp->args.seq_args, &lgp->res.seq_res, 0, task)) return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); + return; + } rpc_call_start(task); } @@ -5313,7 +5373,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) return; } } - lgp->status = task->tk_status; dprintk("<-- %s\n", __func__); } @@ -5322,7 +5381,6 @@ static void nfs4_layoutget_release(void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - put_layout_hdr(lgp->args.inode); if (lgp->res.layout.buf != NULL) free_page((unsigned long) lgp->res.layout.buf); put_nfs_open_context(lgp->args.ctx); @@ -5367,13 +5425,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) if (IS_ERR(task)) return PTR_ERR(task); status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) - goto out; - status = lgp->status; - if (status != 0) - goto out; - status = pnfs_layout_process(lgp); -out: + if (status == 0) + status = task->tk_status; + if (status == 0) + status = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -5504,9 +5559,10 @@ static const struct inode_operations nfs4_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .getxattr = nfs4_getxattr, - .setxattr = nfs4_setxattr, - .listxattr = nfs4_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, }; const struct nfs_rpc_ops nfs_v4_clientops = { @@ -5551,6 +5607,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .open_context = nfs4_atomic_open, }; +static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { + .prefix = XATTR_NAME_NFSV4_ACL, + .list = nfs4_xattr_list_nfs4_acl, + .get = nfs4_xattr_get_nfs4_acl, + .set = nfs4_xattr_set_nfs4_acl, +}; + +const struct xattr_handler *nfs4_xattr_handlers[] = { + &nfs4_xattr_nfs4_acl_handler, + NULL +}; + /* * Local variables: * c-basic-offset: 8 diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 72b6c580af1..402143d75fc 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work) ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); - /* Are there any active superblocks? 
*/ - if (list_empty(&clp->cl_superblocks)) + + rcu_read_lock(); + if (list_empty(&clp->cl_superblocks)) { + rcu_read_unlock(); goto out; + } + rcu_read_unlock(); + spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; last = clp->cl_last_renewal; @@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work) cred = ops->get_state_renewal_cred_locked(clp); spin_unlock(&clp->cl_lock); if (cred == NULL) { - if (list_empty(&clp->cl_delegations)) { + if (!nfs_delegations_present(clp)) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); goto out; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f575a312673..2336d532cf6 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp) put_rpccred(cred); } -struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server) { + struct rpc_cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; - struct rpc_cred *cred = NULL; - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); if (list_empty(&sp->so_states)) continue; cred = get_rpccred(sp->so_cred); @@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) return cred; } +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + * Caller must hold clp->cl_lock. + */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_renew_cred_server_locked(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + return cred; +} + #if defined(CONFIG_NFS_V4_1) static int nfs41_setup_state_renewal(struct nfs_client *clp) @@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) return status; } +/* + * Back channel returns NFS4ERR_DELAY for new requests when + * NFS4_SESSION_DRAINING is set so there is no work to be done when draining + * is ended. 
+ */ static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; @@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp) } } -static int nfs4_begin_drain_session(struct nfs_client *clp) +static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) { - struct nfs4_session *ses = clp->cl_session; - struct nfs4_slot_table *tbl = &ses->fc_slot_table; - spin_lock(&tbl->slot_tbl_lock); - set_bit(NFS4_SESSION_DRAINING, &ses->session_state); if (tbl->highest_used_slotid != -1) { - INIT_COMPLETION(ses->complete); + INIT_COMPLETION(tbl->complete); spin_unlock(&tbl->slot_tbl_lock); - return wait_for_completion_interruptible(&ses->complete); + return wait_for_completion_interruptible(&tbl->complete); } spin_unlock(&tbl->slot_tbl_lock); return 0; } +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int ret = 0; + + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); + /* back channel */ + ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + if (ret) + return ret; + /* fore channel */ + return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); +} + int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { int status; @@ -192,6 +232,12 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) status = nfs4_proc_create_session(clp); if (status != 0) goto out; + status = nfs4_set_callback_sessionid(clp); + if (status != 0) { + printk(KERN_WARNING "Sessionid not set. No callback service\n"); + nfs_callback_down(1); + status = 0; + } nfs41_setup_state_renewal(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: @@ -210,28 +256,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) #endif /* CONFIG_NFS_V4_1 */ -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +static struct rpc_cred * +nfs4_get_setclientid_cred_server(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; + struct rpc_cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; + + spin_lock(&clp->cl_lock); + pos = rb_first(&server->state_owners); + if (pos != NULL) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + cred = get_rpccred(sp->so_cred); + } + spin_unlock(&clp->cl_lock); + return cred; +} + +/** + * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. 
+ */ +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +{ + struct nfs_server *server; struct rpc_cred *cred; spin_lock(&clp->cl_lock); cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); if (cred != NULL) goto out; - pos = rb_first(&clp->cl_state_owners); - if (pos != NULL) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - cred = get_rpccred(sp->so_cred); + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_setclientid_cred_server(server); + if (cred != NULL) + break; } + rcu_read_unlock(); + out: - spin_unlock(&clp->cl_lock); return cred; } -static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, - __u64 minval, int maxbits) +static void nfs_alloc_unique_id_locked(struct rb_root *root, + struct nfs_unique_id *new, + __u64 minval, int maxbits) { struct rb_node **p, *parent; struct nfs_unique_id *pos; @@ -286,16 +360,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) } static struct nfs4_state_owner * -nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { - struct nfs_client *clp = server->nfs_client; - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp, *res = NULL; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); if (server < sp->so_server) { p = &parent->rb_left; @@ -319,24 +392,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) } static struct nfs4_state_owner * -nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) { - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct nfs_server *server = new->so_server; + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); - if (new->so_server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (new->so_server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (new->so_cred < sp->so_cred) p = &parent->rb_left; else if (new->so_cred > sp->so_cred) @@ -346,18 +412,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) return sp; } } - nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); - rb_link_node(&new->so_client_node, parent, p); - rb_insert_color(&new->so_client_node, &clp->cl_state_owners); + nfs_alloc_unique_id_locked(&server->openowner_id, + &new->so_owner_id, 1, 64); + rb_link_node(&new->so_server_node, parent, p); + rb_insert_color(&new->so_server_node, &server->state_owners); return new; } static void -nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) +nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); + struct nfs_server *server = sp->so_server; + + if (!RB_EMPTY_NODE(&sp->so_server_node)) + rb_erase(&sp->so_server_node, &server->state_owners); + 
nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id); } /* @@ -386,23 +455,32 @@ nfs4_alloc_state_owner(void) static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) { - struct nfs_client *clp = sp->so_server->nfs_client; + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - RB_CLEAR_NODE(&sp->so_client_node); + rb_erase(&sp->so_server_node, &server->state_owners); + RB_CLEAR_NODE(&sp->so_server_node); spin_unlock(&clp->cl_lock); } } -struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. + */ +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, + struct rpc_cred *cred) { struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; spin_lock(&clp->cl_lock); - sp = nfs4_find_state_owner(server, cred); + sp = nfs4_find_state_owner_locked(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) return sp; @@ -412,7 +490,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner(clp, new); + sp = nfs4_insert_state_owner_locked(new); spin_unlock(&clp->cl_lock); if (sp == new) get_rpccred(cred); @@ -423,6 +501,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct return sp; } +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { struct nfs_client *clp = sp->so_server->nfs_client; @@ -430,7 +513,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - nfs4_remove_state_owner(clp, sp); + nfs4_remove_state_owner_locked(sp); spin_unlock(&clp->cl_lock); rpc_destroy_wait_queue(&sp->so_sequence.wait); put_rpccred(cred); @@ -585,8 +668,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, if (!call_close) { nfs4_put_open_state(state); nfs4_put_state_owner(owner); - } else - nfs4_do_close(path, state, gfp_mask, wait); + } else { + bool roc = pnfs_roc(state->inode); + + nfs4_do_close(path, state, gfp_mask, wait, roc); + } } void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) @@ -633,7 +719,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *lsp; - struct nfs_client *clp = state->owner->so_server->nfs_client; + struct nfs_server *server = state->owner->so_server; + struct nfs_client *clp = server->nfs_client; lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) @@ -657,7 +744,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f return NULL; } spin_lock(&clp->cl_lock); - nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); spin_unlock(&clp->cl_lock); INIT_LIST_HEAD(&lsp->ls_locks); return lsp; @@ 
-665,10 +752,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) { - struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); + nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id); spin_unlock(&clp->cl_lock); rpc_destroy_wait_queue(&lsp->ls_sequence.wait); kfree(lsp); @@ -1114,15 +1202,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state) } } -static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +static void nfs4_reset_seqids(struct nfs_server *server, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) { + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - /* Reset all sequence ids to zero */ - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); sp->so_seqid.flags = 0; spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { @@ -1131,6 +1223,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re } spin_unlock(&sp->so_lock); } + spin_unlock(&clp->cl_lock); +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_reset_seqids(server, mark_reclaim); + rcu_read_unlock(); } static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) @@ -1148,25 +1252,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp, (void)ops->reclaim_complete(clp); } -static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +static void nfs4_clear_reclaim_server(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - return 0; - - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { - if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, + &state->flags)) continue; nfs4_state_mark_reclaim_nograce(clp, state); } spin_unlock(&sp->so_lock); } + spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ + struct nfs_server *server; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_clear_reclaim_server(server); + rcu_read_unlock(); 
nfs_delegation_reap_unclaimed(clp); return 1; @@ -1238,27 +1358,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) { + struct nfs4_state_owner *sp; + struct nfs_server *server; struct rb_node *pos; int status = 0; restart: - spin_lock(&clp->cl_lock); - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) - continue; - atomic_inc(&sp->so_count); - spin_unlock(&clp->cl_lock); - status = nfs4_reclaim_open_state(sp, ops); - if (status < 0) { - set_bit(ops->owner_flag_bit, &sp->so_flags); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, + struct nfs4_state_owner, so_server_node); + if (!test_and_clear_bit(ops->owner_flag_bit, + &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + return nfs4_recovery_handle_error(clp, status); + } + nfs4_put_state_owner(sp); - return nfs4_recovery_handle_error(clp, status); + goto restart; } - nfs4_put_state_owner(sp); - goto restart; + spin_unlock(&clp->cl_lock); } - spin_unlock(&clp->cl_lock); + rcu_read_unlock(); return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9f1826b012e..2ab8e5cb8f5 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int); /* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define open_owner_id_maxsz (1 + 4) -#define lock_owner_id_maxsz (1 + 4) +#define open_owner_id_maxsz (1 + 1 + 4) +#define lock_owner_id_maxsz (1 + 1 + 4) #define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) @@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo { __be32 *p; - p = reserve_space(xdr, 28); + p = reserve_space(xdr, 32); p = xdr_encode_hyper(p, lowner->clientid); - *p++ = cpu_to_be32(16); + *p++ = cpu_to_be32(20); p = xdr_encode_opaque_fixed(p, "lock id:", 8); + *p++ = cpu_to_be32(lowner->s_dev); xdr_encode_hyper(p, lowner->id); } @@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena *p++ = cpu_to_be32(OP_OPEN); *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - p = reserve_space(xdr, 28); + p = reserve_space(xdr, 32); p = xdr_encode_hyper(p, arg->clientid); - *p++ = cpu_to_be32(16); + *p++ = cpu_to_be32(20); p = xdr_encode_opaque_fixed(p, "open id:", 8); + *p++ = cpu_to_be32(arg->server->s_dev); xdr_encode_hyper(p, arg->id); } @@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_restorefh_maxsz; } -static int +static void encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) { __be32 *p; @@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 2*4); *p++ 
= cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); - if (arg->acl_len % 4) - return -EINVAL; + BUG_ON(arg->acl_len % 4); p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; - return 0; } static void @@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr, const struct nfs4_layoutget_args *args, struct compound_hdr *hdr) { - nfs4_stateid stateid; __be32 *p; p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); @@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->range.offset); p = xdr_encode_hyper(p, args->range.length); p = xdr_encode_hyper(p, args->minlength); - pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, - args->ctx->state); - p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); *p = cpu_to_be32(args->maxcount); dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", @@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) /* * Encode an ACCESS request */ -static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_accessargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_access(&xdr, args->access, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_access(xdr, args->access, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LOOKUP request */ -static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_lookup_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_lookup(&xdr, args->name, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LOOKUP_ROOT request */ -static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_lookup_root_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putrootfh(&xdr, &hdr); - encode_getfh(&xdr, &hdr); - 
encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode REMOVE request */ -static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_removeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_remove(&xdr, &args->name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_remove(xdr, &args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode RENAME request */ -static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args) +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->old_dir, &hdr); - encode_savefh(&xdr, &hdr); - encode_putfh(&xdr, args->new_dir, &hdr); - encode_rename(&xdr, args->old_name, args->new_name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->old_dir, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->new_dir, &hdr); + encode_rename(xdr, args->old_name, args->new_name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LINK request */ -static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_link_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_link(&xdr, args->name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_link(xdr, args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode CREATE request */ -static int nfs4_xdr_enc_create(struct rpc_rqst 
*req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_create(&xdr, args, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_create(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode SYMLINK request */ -static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - return nfs4_xdr_enc_create(req, p, args); + nfs4_xdr_enc_create(req, xdr, args); } /* * Encode GETATTR request */ -static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_getattr_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a CLOSE request */ -static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_closeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_close(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_close(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN request */ -static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_openargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_savefh(&xdr, &hdr); - 
-	encode_open(&xdr, args, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_CONFIRM request
  */
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs_open_confirmargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_confirm(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_confirm(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN request with no attributes.
  */
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs_openargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_DOWNGRADE request
  */
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs_closeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_downgrade(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_downgrade(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCK request
  */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_lock_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lock(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lock(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKT request
  */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_lockt_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lockt(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lockt(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKU request
  */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_locku_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_locku(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_locku(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
-static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+					   struct xdr_stream *xdr,
+					   struct nfs_release_lockowner_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = 0,
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_release_lockowner(xdr, &args->lock_owner, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READLINK request
  */
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_readlink *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readlink(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readlink(xdr, args, req, &hdr);
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->pglen);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READDIR request
  */
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_readdir_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readdir(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readdir(xdr, args, req, &hdr);
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count);
@@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
 		 __func__, hdr.replen << 2, args->pages, args->pgbase, args->count);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READ request
  */
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_readargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_read(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_read(xdr, args, &hdr);
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an SETATTR request
  */
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_setattrargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_setattr(&xdr, args, args->server, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setattr(xdr, args, args->server, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a GETACL request
  */
-static int
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
-		struct nfs_getaclargs *args)
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_getaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
 	replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
-	encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
+	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a WRITE request
  */
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_write(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_write(xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a COMMIT request
  */
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_commit(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_commit(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * FSINFO request
  */
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs4_fsinfo_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_fsinfo(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_fsinfo(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a PATHCONF request
  */
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_pathconf_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
 			   &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a STATFS request
  */
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_statfs_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
 			args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
-				    struct nfs4_server_caps_arg *args)
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_server_caps_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
 			   FATTR4_WORD0_LINK_SUPPORT|
 			   FATTR4_WORD0_SYMLINK_SUPPORT|
 			   FATTR4_WORD0_ACLSUPPORT, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RENEW request
  */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_client *clp)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_renew(&xdr, clp, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_renew(xdr, clp, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID request
  */
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_setclientid *sc)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid(&xdr, sc, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid(xdr, sc, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+					     struct xdr_stream *xdr,
+					     struct nfs4_setclientid_res *arg)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid_confirm(&xdr, arg, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid_confirm(xdr, arg, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * DELEGRETURN request
  */
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_delegreturnargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_delegreturn(&xdr, args->stateid, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_delegreturn(xdr, args->stateid, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode FS_LOCATIONS request
  */
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fs_locations_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_lookup(&xdr, args->name, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
 	replen = hdr.replen;	/* get the attribute into args->page */
-	encode_fs_locations(&xdr, args->bitmask, &hdr);
+	encode_fs_locations(xdr, args->bitmask, &hdr);
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 0, PAGE_SIZE);
 	encode_nops(&hdr);
-	return 0;
 }
 
 #if defined(CONFIG_NFS_V4_1)
 /*
  * EXCHANGE_ID request
  */
-static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
-				    struct nfs41_exchange_id_args *args)
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs41_exchange_id_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_exchange_id(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_exchange_id(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a CREATE_SESSION request
  */
-static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs41_create_session_args *args)
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs41_create_session_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_create_session(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_create_session(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a DESTROY_SESSION request
  */
-static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
-					struct nfs4_session *session)
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs4_session *session)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = session->clp->cl_mvops->minor_version,
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_destroy_session(&xdr, session, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_session(xdr, session, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SEQUENCE request
  */
-static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
-				 struct nfs4_sequence_args *args)
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs4_sequence_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a GET_LEASE_TIME request
  */
-static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs4_get_lease_time_args *args)
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs4_get_lease_time_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->la_seq_args, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RECLAIM_COMPLETE request
  */
-static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
-					 struct nfs41_reclaim_complete_args *args)
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
+					  struct xdr_stream *xdr,
+					  struct nfs41_reclaim_complete_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args)
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_reclaim_complete(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_reclaim_complete(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode GETDEVICEINFO request
  */
-static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
-				      struct nfs4_getdeviceinfo_args *args)
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       struct nfs4_getdeviceinfo_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_getdeviceinfo(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(xdr, args, &hdr);
 	/* set up reply kvec. Subtract notification bitmap max size (2)
 	 * so that notification bitmap is put in xdr_buf tail */
@@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
 			args->pdev->pglen);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LAYOUTGET request
  */
-static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
-				  struct nfs4_layoutget_args *args)
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs4_layoutget_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-	encode_layoutget(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 		goto out_overflow;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p);
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+	hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
 		dprintk("NFS: server cheating in read reply: "
@@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 		goto out_overflow;
 	len = be32_to_cpup(p);
 	if (len) {
-		int i;
+		uint32_t i;
 		p = xdr_inline_decode(xdr, 4 * len);
 		if (unlikely(!p))
@@ -5090,26 +5012,26 @@ out_overflow:
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_downgrade(&xdr, res);
+	status = decode_open_downgrade(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
	return status;
@@ -5118,26 +5040,25 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_accessres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(&xdr, res);
+	status = decode_access(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5146,26 +5067,28 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server
+	status = decode_getfattr(xdr, res->fattr, res->server
 			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5174,23 +5097,25 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putrootfh(&xdr)) != 0)
+	status = decode_putrootfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfh(xdr, res->fh);
+	if (status == 0)
+		status = decode_getfattr(xdr, res->fattr, res->server,
 				!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5199,24 +5124,25 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_removeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+	status = decode_remove(xdr, &res->cinfo);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5225,34 +5151,38 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_renameres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+	if (status)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+	if (decode_getfattr(xdr, res->new_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server,
+	decode_getfattr(xdr, res->old_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5261,37 +5191,41 @@ out:
 /*
  * Decode LINK response
  */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs4_link_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_link(&xdr, &res->cinfo)) != 0)
+	status = decode_link(xdr, &res->cinfo);
+	if (status)
 		goto out;
 	/*
 	 * Note order: OP_LINK leaves the directory as the current
 	 * filehandle.
 	 */
-	if (decode_getfattr(&xdr, res->dir_attr, res->server,
+	if (decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5300,33 +5234,37 @@ out:
 /*
  * Decode CREATE response
  */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_create_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
+	status = decode_create(xdr, &res->dir_cinfo);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	if (decode_getfattr(&xdr, res->fattr, res->server,
+	if (decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_fattr, res->server,
+	decode_getfattr(xdr, res->dir_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5335,31 +5273,31 @@ out:
 /*
  * Decode SYMLINK response
  */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_create_res *res)
 {
-	return nfs4_xdr_dec_create(rqstp, p, res);
+	return nfs4_xdr_dec_create(rqstp, xdr, res);
 }
 
 /*
  * Decode GETATTR response
  */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_getattr_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5368,46 +5306,40 @@ out:
 /*
  * Encode an SETACL request
  */
-static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_setaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	int status;
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	status = encode_setacl(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setacl(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return status;
 }
 
 /*
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_setaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 out:
 	return status;
 }
@@ -5416,24 +5348,22 @@ out:
 /*
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_getaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(&xdr, rqstp, &res->acl_len);
+	status = decode_getacl(xdr, rqstp, &res->acl_len);
 out:
 	return status;
@@ -5442,23 +5372,22 @@ out:
 /*
  * Decode CLOSE response
  */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_close(&xdr, res);
+	status = decode_close(xdr, res);
 	if (status != 0)
 		goto out;
 	/*
@@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	 * an ESTALE error. Shouldn't be a problem,
 	 * though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5476,36 +5405,35 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(&xdr);
+	status = decode_savefh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	if (decode_getfh(&xdr, &res->fh) != 0)
+	if (decode_getfh(xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->f_attr, res->server,
+	if (decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if (decode_restorefh(&xdr) != 0)
+	if (decode_restorefh(xdr) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5514,20 +5442,20 @@ out:
 /*
  * Decode OPEN_CONFIRM response
  */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs_open_confirmres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_confirm(&xdr, res);
+	status = decode_open_confirm(xdr, res);
 out:
 	return status;
 }
@@ -5535,26 +5463,26 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->f_attr, res->server,
+	decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5563,26 +5491,26 @@ out:
 /*
  * Decode SETATTR response
  */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nfs_setattrres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5591,23 +5519,22 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_lock_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lock(&xdr, res);
+	status = decode_lock(xdr, res);
 out:
 	return status;
 }
@@ -5615,23 +5542,22 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_lockt_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lockt(&xdr, res);
+	status = decode_lockt(xdr, res);
 out:
 	return status;
 }
@@ -5639,61 +5565,58 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_locku_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_locku(&xdr, res);
+	status = decode_locku(xdr, res);
 out:
 	return status;
 }
 
-static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+					  struct xdr_stream *xdr, void *dummy)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_release_lockowner(&xdr);
+		status = decode_release_lockowner(xdr);
 	return status;
 }
 
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
 				 struct nfs4_readlink_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_readlink(&xdr, rqstp);
+	status = decode_readlink(xdr, rqstp);
out:
	return status;
}
@@ -5701,23 +5624,22 @@ out:
 /*
  * Decode READDIR response
  */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_readdir_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_readdir(&xdr, rqstp, res);
+	status = decode_readdir(xdr, rqstp, res);
out:
	return status;
}
@@ -5725,23 +5647,22 @@ out:
 /*
  * Decode Read response
  */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_readres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_read(&xdr, rqstp, res);
+	status = decode_read(xdr, rqstp, res);
 	if (!status)
 		status = res->count;
 out:
@@ -5751,26 +5672,25 @@ out:
 /*
  * Decode WRITE response
  */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_writeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_write(&xdr, res);
+	status = decode_write(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 	if (!status)
 		status = res->count;
@@ -5781,26 +5701,25 @@ out:
 /*
  * Decode COMMIT response
  */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_writeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_commit(&xdr, res);
+	status = decode_commit(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5809,85 +5728,80 @@ out:
 /*
  * Decode FSINFO response
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
 			       struct nfs4_fsinfo_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, res->fsinfo);
+		status = decode_fsinfo(xdr, res->fsinfo);
 	return status;
 }
 
 /*
  * Decode PATHCONF response
  */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 struct nfs4_pathconf_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_pathconf(&xdr, res->pathconf);
+		status = decode_pathconf(xdr, res->pathconf);
 	return status;
 }
 
 /*
  * Decode STATFS response
  */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
 			       struct nfs4_statfs_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_statfs(&xdr, res->fsstat);
+		status = decode_statfs(xdr, res->fsstat);
 	return status;
 }
 
 /*
  * Decode GETATTR_BITMAP response
  */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_server_caps_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, req);
+	status = decode_sequence(xdr, &res->seq_res, req);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	status = decode_server_caps(&xdr, res);
+	status = decode_server_caps(xdr, res);
 out:
 	return status;
 }
@@ -5895,79 +5809,77 @@ out:
 /*
  * Decode RENEW response
  */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      void *__unused)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_renew(&xdr);
+		status = decode_renew(xdr);
 	return status;
 }
 
 /*
  * Decode SETCLIENTID response
  */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
-				    struct nfs4_setclientid_res *res)
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_setclientid_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_setclientid(&xdr, res);
+		status = decode_setclientid(xdr, res);
 	return status;
 }
 
 /*
  * Decode SETCLIENTID_CONFIRM response
  */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+					    struct xdr_stream *xdr,
+					    struct nfs_fsinfo *fsinfo)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_setclientid_confirm(&xdr);
+		status = decode_setclientid_confirm(xdr);
 	if (!status)
-		status = decode_putrootfh(&xdr);
+		status = decode_putrootfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, fsinfo);
+		status = decode_fsinfo(xdr, fsinfo);
 	return status;
 }
 
 /*
  * Decode DELEGRETURN response
  */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_delegreturnres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_delegreturn(&xdr);
+	status = decode_delegreturn(xdr);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5976,26 +5888,27 @@ out:
 /*
  * Decode FS_LOCATIONS response
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
 				     struct nfs4_fs_locations_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, req);
+	status = decode_sequence(xdr, &res->seq_res, req);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
if (status) goto out; - if ((status = decode_lookup(&xdr)) != 0) + status = decode_lookup(xdr); + if (status) goto out; - xdr_enter_page(&xdr, PAGE_SIZE); - status = decode_getfattr(&xdr, &res->fs_locations->fattr, + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr(xdr, &res->fs_locations->fattr, res->fs_locations->server, !RPC_IS_ASYNC(req->rq_task)); out: @@ -6006,129 +5919,122 @@ out: /* * Decode EXCHANGE_ID response */ -static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_exchange_id(&xdr, res); + status = decode_exchange_id(xdr, res); return status; } /* * Decode CREATE_SESSION response */ -static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs41_create_session_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_create_session(&xdr, res); + status = decode_create_session(xdr, res); return status; } /* * Decode DESTROY_SESSION response */ -static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, - void *dummy) +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_destroy_session(&xdr, dummy); + status = decode_destroy_session(xdr, res); return status; } /* * Decode SEQUENCE response */ -static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_sequence_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, res, rqstp); + status = decode_sequence(xdr, res, rqstp); return status; } /* * Decode GET_LEASE_TIME response */ -static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_get_lease_time_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); + status = decode_sequence(xdr, &res->lr_seq_res, rqstp); if (!status) - status = decode_putrootfh(&xdr); + status = decode_putrootfh(xdr); if (!status) - status = decode_fsinfo(&xdr, res->lr_fsinfo); + status = decode_fsinfo(xdr, res->lr_fsinfo); return status; } /* * Decode RECLAIM_COMPLETE response */ -static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, + struct 
xdr_stream *xdr, struct nfs41_reclaim_complete_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (!status) - status = decode_reclaim_complete(&xdr, (void *)NULL); + status = decode_reclaim_complete(xdr, (void *)NULL); return status; } /* * Decode GETDEVINFO response */ -static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status != 0) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(&xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res->pdev); out: return status; } @@ -6136,31 +6042,44 @@ out: /* * Decode LAYOUTGET response */ -static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_layoutget_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_layoutget(&xdr, rqstp, res); + status = decode_layoutget(xdr, rqstp, res); out: return status; } #endif /* CONFIG_NFS_V4_1 */ -__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, - struct nfs_server *server, int plus) +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. 
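+ * + * A minimal caller sketch (illustrative only, not part of this patch; emit_dirent() is a hypothetical helper). A return of -EAGAIN means no further entry can be decoded from this page; -EBADCOOKIE with entry->eof set marks the end of the directory: + * + * int err; + * + * while ((err = nfs4_decode_dirent(xdr, entry, plus)) == 0) + * emit_dirent(entry);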
+ */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) { uint32_t bitmap[2] = {0}; uint32_t len; @@ -6172,9 +6091,9 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(!p)) goto out_overflow; if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } p = xdr_inline_decode(xdr, 12); @@ -6203,7 +6122,8 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (decode_attr_length(xdr, &len, &p) < 0) goto out_overflow; - if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, + entry->server, 1) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; @@ -6215,17 +6135,11 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - - return p; + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* @@ -6301,8 +6215,8 @@ nfs4_stat_to_errno(int stat) #define PROC(proc, argtype, restype) \ [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_COMPOUND, \ - .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ + .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \ .p_arglen = NFS4_##argtype##_sz, \ .p_replen = NFS4_##restype##_sz, \ .p_statidx = NFSPROC4_CLNT_##proc, \ @@ -6310,50 +6224,50 @@ nfs4_stat_to_errno(int stat) } struct rpc_procinfo nfs4_procedures[] = { - PROC(READ, enc_read, dec_read), - PROC(WRITE, enc_write, dec_write), - PROC(COMMIT, enc_commit, dec_commit), - PROC(OPEN, enc_open, dec_open), - PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), - PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), - PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), - PROC(CLOSE, enc_close, dec_close), - PROC(SETATTR, enc_setattr, dec_setattr), - PROC(FSINFO, enc_fsinfo, dec_fsinfo), - PROC(RENEW, enc_renew, dec_renew), - PROC(SETCLIENTID, enc_setclientid, dec_setclientid), - PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), - PROC(LOCK, enc_lock, dec_lock), - PROC(LOCKT, enc_lockt, dec_lockt), - PROC(LOCKU, enc_locku, dec_locku), - PROC(ACCESS, enc_access, dec_access), - PROC(GETATTR, enc_getattr, dec_getattr), - PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), - PROC(REMOVE, enc_remove, dec_remove), - PROC(RENAME, enc_rename, dec_rename), - PROC(LINK, enc_link, dec_link), - PROC(SYMLINK, enc_symlink, dec_symlink), - PROC(CREATE, enc_create, dec_create), - PROC(PATHCONF, enc_pathconf, dec_pathconf), - PROC(STATFS, enc_statfs, dec_statfs), - PROC(READLINK, enc_readlink, dec_readlink), - PROC(READDIR, enc_readdir, dec_readdir), - PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), - PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), - PROC(GETACL, enc_getacl, dec_getacl), - PROC(SETACL, enc_setacl, dec_setacl), - PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), - PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(READ, enc_read, dec_read), + PROC(WRITE, enc_write, dec_write), + PROC(COMMIT, enc_commit, dec_commit), + PROC(OPEN, enc_open, dec_open), + 
PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), + PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), + PROC(CLOSE, enc_close, dec_close), + PROC(SETATTR, enc_setattr, dec_setattr), + PROC(FSINFO, enc_fsinfo, dec_fsinfo), + PROC(RENEW, enc_renew, dec_renew), + PROC(SETCLIENTID, enc_setclientid, dec_setclientid), + PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), + PROC(LOCK, enc_lock, dec_lock), + PROC(LOCKT, enc_lockt, dec_lockt), + PROC(LOCKU, enc_locku, dec_locku), + PROC(ACCESS, enc_access, dec_access), + PROC(GETATTR, enc_getattr, dec_getattr), + PROC(LOOKUP, enc_lookup, dec_lookup), + PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), + PROC(REMOVE, enc_remove, dec_remove), + PROC(RENAME, enc_rename, dec_rename), + PROC(LINK, enc_link, dec_link), + PROC(SYMLINK, enc_symlink, dec_symlink), + PROC(CREATE, enc_create, dec_create), + PROC(PATHCONF, enc_pathconf, dec_pathconf), + PROC(STATFS, enc_statfs, dec_statfs), + PROC(READLINK, enc_readlink, dec_readlink), + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), #if defined(CONFIG_NFS_V4_1) - PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), - PROC(CREATE_SESSION, enc_create_session, dec_create_session), - PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), - PROC(SEQUENCE, enc_sequence, dec_sequence), - PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), - PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), - PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), + PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b68536cc904..e1164e3f9e6 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep; static inline struct nfs_page * nfs_page_alloc(void) { - struct nfs_page *p; - p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); - if (p) { - memset(p, 0, sizeof(*p)); + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + if (p) INIT_LIST_HEAD(&p->wb_list); - } return p; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index db773428f95..bc408976973 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); * pNFS client layout cache */ +/* Need to hold i_lock if caller does not already hold reference */ +void +get_layout_hdr(struct pnfs_layout_hdr *lo) +{ + atomic_inc(&lo->plh_refcount); +} + static void -get_layout_hdr_locked(struct pnfs_layout_hdr *lo) +destroy_layout_hdr(struct pnfs_layout_hdr *lo) { - assert_spin_locked(&lo->inode->i_lock); - lo->refcount++; + dprintk("%s: freeing layout cache %p\n", 
__func__, lo); + BUG_ON(!list_empty(&lo->plh_layouts)); + NFS_I(lo->plh_inode)->layout = NULL; + kfree(lo); } static void put_layout_hdr_locked(struct pnfs_layout_hdr *lo) { - assert_spin_locked(&lo->inode->i_lock); - BUG_ON(lo->refcount == 0); - - lo->refcount--; - if (!lo->refcount) { - dprintk("%s: freeing layout cache %p\n", __func__, lo); - BUG_ON(!list_empty(&lo->layouts)); - NFS_I(lo->inode)->layout = NULL; - kfree(lo); - } + if (atomic_dec_and_test(&lo->plh_refcount)) + destroy_layout_hdr(lo); } void -put_layout_hdr(struct inode *inode) +put_layout_hdr(struct pnfs_layout_hdr *lo) { - spin_lock(&inode->i_lock); - put_layout_hdr_locked(NFS_I(inode)->layout); - spin_unlock(&inode->i_lock); + struct inode *inode = lo->plh_inode; + + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + destroy_layout_hdr(lo); + spin_unlock(&inode->i_lock); + } } static void init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { - INIT_LIST_HEAD(&lseg->fi_list); - kref_init(&lseg->kref); - lseg->layout = lo; + INIT_LIST_HEAD(&lseg->pls_list); + atomic_set(&lseg->pls_refcount, 1); + smp_mb(); + set_bit(NFS_LSEG_VALID, &lseg->pls_flags); + lseg->pls_layout = lo; } -/* Called without i_lock held, as the free_lseg call may sleep */ -static void -destroy_lseg(struct kref *kref) +static void free_lseg(struct pnfs_layout_segment *lseg) { - struct pnfs_layout_segment *lseg = - container_of(kref, struct pnfs_layout_segment, kref); - struct inode *ino = lseg->layout->inode; + struct inode *ino = lseg->pls_layout->plh_inode; - dprintk("--> %s\n", __func__); NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ - put_layout_hdr(ino); + /* Matched by get_layout_hdr in pnfs_insert_layout */ + put_layout_hdr(NFS_I(ino)->layout); } -static void -put_lseg(struct pnfs_layout_segment *lseg) +/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg + * could sleep, so must be called outside of the lock. + * Returns 1 if object was removed, otherwise return 0. + */ +static int +put_lseg_locked(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + struct inode *ino = lseg->pls_layout->plh_inode; + + BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del(&lseg->pls_list); + if (list_empty(&lseg->pls_layout->plh_segs)) { + struct nfs_client *clp; + + clp = NFS_SERVER(ino)->nfs_client; + spin_lock(&clp->cl_lock); + /* List does not take a reference, so no need for put here */ + list_del_init(&lseg->pls_layout->plh_layouts); + spin_unlock(&clp->cl_lock); + clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); + } + rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); + list_add(&lseg->pls_list, tmp_list); + return 1; + } + return 0; +} + +static bool +should_free_lseg(u32 lseg_iomode, u32 recall_iomode) { - if (!lseg) - return; + return (recall_iomode == IOMODE_ANY || + lseg_iomode == recall_iomode); +} - dprintk("%s: lseg %p ref %d\n", __func__, lseg, - atomic_read(&lseg->kref.refcount)); - kref_put(&lseg->kref, destroy_lseg); +/* Returns 1 if lseg is removed from list, 0 otherwise */ +static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + int rv = 0; + + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + /* Remove the reference keeping the lseg in the + * list. 
It will now be removed when all + * outstanding io is finished. + */ + rv = put_lseg_locked(lseg, tmp_list); + } + return rv; } -static void -pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) +/* Returns count of number of matching invalid lsegs remaining in list + * after call. + */ +int +mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + u32 iomode) { struct pnfs_layout_segment *lseg, *next; - struct nfs_client *clp; + int invalid = 0, removed = 0; dprintk("%s:Begin lo %p\n", __func__, lo); - assert_spin_locked(&lo->inode->i_lock); - list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { - dprintk("%s: freeing lseg %p\n", __func__, lseg); - list_move(&lseg->fi_list, tmp_list); - } - clp = NFS_SERVER(lo->inode)->nfs_client; - spin_lock(&clp->cl_lock); - /* List does not take a reference, so no need for put here */ - list_del_init(&lo->layouts); - spin_unlock(&clp->cl_lock); - write_seqlock(&lo->seqlock); - clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state); - write_sequnlock(&lo->seqlock); - - dprintk("%s:Return\n", __func__); + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (should_free_lseg(lseg->pls_range.iomode, iomode)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, lseg->pls_range.offset, + lseg->pls_range.length); + invalid++; + removed += mark_lseg_invalid(lseg, tmp_list); + } + dprintk("%s:Return %i\n", __func__, invalid - removed); + return invalid - removed; } -static void -pnfs_free_lseg_list(struct list_head *tmp_list) +void +pnfs_free_lseg_list(struct list_head *free_me) { - struct pnfs_layout_segment *lseg; + struct pnfs_layout_segment *lseg, *tmp; - while (!list_empty(tmp_list)) { - lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, - fi_list); - dprintk("%s calling put_lseg on %p\n", __func__, lseg); - list_del(&lseg->fi_list); - put_lseg(lseg); + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { + list_del(&lseg->pls_list); + free_lseg(lseg); } } @@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - pnfs_clear_lseg_list(lo, &tmp_list); + set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); + mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); /* Matched by refcount set to 1 in alloc_init_layout_hdr */ put_layout_hdr_locked(lo); } @@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) while (!list_empty(&tmp_list)) { lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, - layouts); + plh_layouts); dprintk("%s freeing layout for inode %lu\n", __func__, - lo->inode->i_ino); - pnfs_destroy_layout(NFS_I(lo->inode)); + lo->plh_inode->i_ino); + pnfs_destroy_layout(NFS_I(lo->plh_inode)); } } -/* update lo->stateid with new if is more recent - * - * lo->stateid could be the open stateid, in which case we just use what given. 
- */ -static void -pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new) -{ - nfs4_stateid *old = &lo->stateid; - bool overwrite = false; - - write_seqlock(&lo->seqlock); - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || - memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) - overwrite = true; - else { - u32 oldseq, newseq; - - oldseq = be32_to_cpu(old->stateid.seqid); - newseq = be32_to_cpu(new->stateid.seqid); - if ((int)(newseq - oldseq) > 0) - overwrite = true; +/* update lo->plh_stateid with new if it is more recent */ +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + bool update_barrier) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) { + memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); + if (update_barrier) { + u32 new_barrier = be32_to_cpu(new->stateid.seqid); + + if ((int)(new_barrier - lo->plh_barrier)) + lo->plh_barrier = new_barrier; + } else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. It needs to be + * within 2**31 to count as "behind", so if it + * gets too near that limit, give us a little leeway + * and bring it to within 2**30. + * NOTE - and yes, this is all unsigned arithmetic. + */ + if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) + lo->plh_barrier = newseq - (1 << 30); + } } - if (overwrite) - memcpy(&old->stateid, &new->stateid, sizeof(new->stateid)); - write_sequnlock(&lo->seqlock); } -static void -pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, - struct nfs4_state *state) +/* lget is set to 1 if called from inside send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + int lget) { - int seq; - - dprintk("--> %s\n", __func__); - write_seqlock(&lo->seqlock); - do { - seq = read_seqbegin(&state->seqlock); - memcpy(lo->stateid.data, state->stateid.data, - sizeof(state->stateid.data)); - } while (read_seqretry(&state->seqlock, seq)); - set_bit(NFS_LAYOUT_STATEID_SET, &lo->state); - write_sequnlock(&lo->seqlock); - dprintk("<-- %s\n", __func__); + if ((stateid) && + (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) + return true; + return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)); } -void -pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct nfs4_state *open_state) +int +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) { - int seq; + int status = 0; dprintk("--> %s\n", __func__); - do { - seq = read_seqbegin(&lo->seqlock); - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { - /* This will trigger retry of the read */ - pnfs_layout_from_open_stateid(lo, open_state); - } else - memcpy(dst->data, lo->stateid.data, - sizeof(lo->stateid.data)); - } while (read_seqretry(&lo->seqlock, seq)); + spin_lock(&lo->plh_inode->i_lock); + if (pnfs_layoutgets_blocked(lo, NULL, 1)) { + status = -EAGAIN; + } else if (list_empty(&lo->plh_segs)) { + int seq; + + do { + seq = read_seqbegin(&open_state->seqlock); + memcpy(dst->data, open_state->stateid.data, + sizeof(open_state->stateid.data)); + } while (read_seqretry(&open_state->seqlock, seq)); + } else + memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); +
spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); + return status; } /* @@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, u32 iomode) { - struct inode *ino = lo->inode; + struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg = NULL; @@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo, BUG_ON(ctx == NULL); lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); - if (lgp == NULL) { - put_layout_hdr(lo->inode); + if (lgp == NULL) return NULL; - } lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range.iomode = iomode; @@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo, nfs4_proc_layoutget(lgp); if (!lseg) { /* remember that LAYOUTGET failed and suspend trying */ - set_bit(lo_fail_bit(iomode), &lo->state); + set_bit(lo_fail_bit(iomode), &lo->plh_flags); } return lseg; } +bool pnfs_roc(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; + LIST_HEAD(tmp_list); + bool found = false; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_nolayout; + list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + mark_lseg_invalid(lseg, &tmp_list); + found = true; + } + if (!found) + goto out_nolayout; + lo->plh_block_lgets++; + get_layout_hdr(lo); /* matched in pnfs_roc_release */ + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + return true; + +out_nolayout: + spin_unlock(&ino->i_lock); + return false; +} + +void pnfs_roc_release(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + lo->plh_block_lgets--; + put_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if ((int)(barrier - lo->plh_barrier) > 0) + lo->plh_barrier = barrier; + spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_segment *lseg; + bool found = false; + + spin_lock(&ino->i_lock); + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + found = true; + break; + } + if (!found) { + struct pnfs_layout_hdr *lo = nfsi->layout; + u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); + + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); + } + spin_unlock(&ino->i_lock); + return found; +} + /* * Compare two layout segments for sorting into layout cache. 
* We want to preferentially return RW over RO layouts, so ensure those @@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->inode->i_lock); - if (list_empty(&lo->segs)) { - struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; - - spin_lock(&clp->cl_lock); - BUG_ON(!list_empty(&lo->layouts)); - list_add_tail(&lo->layouts, &clp->cl_layouts); - spin_unlock(&clp->cl_lock); - } - list_for_each_entry(lp, &lo->segs, fi_list) { - if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0) + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lp, &lo->plh_segs, pls_list) { + if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) continue; - list_add_tail(&lseg->fi_list, &lp->fi_list); + list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " "iomode %d offset %llu length %llu before " "lp %p iomode %d offset %llu length %llu\n", - __func__, lseg, lseg->range.iomode, - lseg->range.offset, lseg->range.length, - lp, lp->range.iomode, lp->range.offset, - lp->range.length); + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length, + lp, lp->pls_range.iomode, lp->pls_range.offset, + lp->pls_range.length); found = 1; break; } if (!found) { - list_add_tail(&lseg->fi_list, &lo->segs); + list_add_tail(&lseg->pls_list, &lo->plh_segs); dprintk("%s: inserted lseg %p " "iomode %d offset %llu length %llu at tail\n", - __func__, lseg, lseg->range.iomode, - lseg->range.offset, lseg->range.length); + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length); } - get_layout_hdr_locked(lo); + get_layout_hdr(lo); dprintk("%s:Return\n", __func__); } @@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino) lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); if (!lo) return NULL; - lo->refcount = 1; - INIT_LIST_HEAD(&lo->layouts); - INIT_LIST_HEAD(&lo->segs); - seqlock_init(&lo->seqlock); - lo->inode = ino; + atomic_set(&lo->plh_refcount, 1); + INIT_LIST_HEAD(&lo->plh_layouts); + INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_bulk_recall); + lo->plh_inode = ino; return lo; } @@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino) dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); assert_spin_locked(&ino->i_lock); - if (nfsi->layout) - return nfsi->layout; - + if (nfsi->layout) { + if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) + return NULL; + else + return nfsi->layout; + } spin_unlock(&ino->i_lock); new = alloc_init_layout_hdr(ino); spin_lock(&ino->i_lock); @@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino) static int is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) { - return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); + return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); } /* * lookup range in layout */ static struct pnfs_layout_segment * -pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) +pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) { struct pnfs_layout_segment *lseg, *ret = NULL; dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->inode->i_lock); - list_for_each_entry(lseg, &lo->segs, fi_list) { - if (is_matching_lseg(lseg, iomode)) { + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && + is_matching_lseg(lseg, iomode)) { ret = lseg; break; } - if (cmp_layout(iomode, lseg->range.iomode) 
> 0) + if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) break; } dprintk("%s:Return lseg %p ref %d\n", - __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); + __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); return ret; } @@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode) { struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; @@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; } - /* Check to see if the layout for the given range already exists */ - lseg = pnfs_has_layout(lo, iomode); - if (lseg) { - dprintk("%s: Using cached lseg %p for iomode %d)\n", - __func__, lseg, iomode); + /* Do we even need to bother with this? */ + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, iomode); + if (lseg) + goto out_unlock; /* if LAYOUTGET already failed once we don't try again */ - if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + goto out_unlock; + + if (pnfs_layoutgets_blocked(lo, NULL, 0)) goto out_unlock; + atomic_inc(&lo->plh_outstanding); - get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ + get_layout_hdr(lo); + if (list_empty(&lo->plh_segs)) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. + */ + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->plh_layouts)); + list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } spin_unlock(&ino->i_lock); lseg = send_layoutget(lo, ctx, iomode); + if (!lseg) { + spin_lock(&ino->i_lock); + if (list_empty(&lo->plh_segs)) { + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + } + spin_unlock(&ino->i_lock); + } + atomic_dec(&lo->plh_outstanding); + put_layout_hdr(lo); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, - nfsi->layout->state, lseg); + nfsi->layout->plh_flags, lseg); return lseg; out_unlock: spin_unlock(&ino->i_lock); @@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; - struct inode *ino = lo->inode; + struct inode *ino = lo->plh_inode; + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; int status = 0; + /* Verify we got what we asked for. + * Note that because the xdr parsing only accepts a single + * element array, this can fail even if the server is behaving + * correctly. 
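+ * + * Illustrative note (assuming the usual pnfs_iomode values IOMODE_READ == 1 and IOMODE_RW == 2): a request for IOMODE_READ may legitimately be answered with IOMODE_RW, since a read/write layout also covers reads; an IOMODE_RW request answered with IOMODE_READ fails the iomode check below, as does any reply that does not cover the whole file (offset 0, length NFS4_MAX_UINT64).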
+ */ + if (lgp->args.range.iomode > res->range.iomode || + res->range.offset != 0 || + res->range.length != NFS4_MAX_UINT64) { + status = -EINVAL; + goto out; + } /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); if (!lseg || IS_ERR(lseg)) { @@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } spin_lock(&ino->i_lock); + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s forget reply due to recall\n", __func__); + goto out_forget_reply; + } + + if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { + dprintk("%s forget reply due to state\n", __func__); + goto out_forget_reply; + } init_lseg(lo, lseg); - lseg->range = res->range; + lseg->pls_range = res->range; *lgp->lsegpp = lseg; pnfs_insert_layout(lo, lseg); + if (res->return_on_close) { + set_bit(NFS_LSEG_ROC, &lseg->pls_flags); + set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); + } + /* Done processing layoutget. Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid); + pnfs_set_layout_stateid(lo, &res->stateid, false); spin_unlock(&ino->i_lock); out: return status; + +out_forget_reply: + spin_unlock(&ino->i_lock); + lseg->pls_layout = lo; + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + goto out; } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e12367d5048..e2612ea0cbe 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,11 +30,17 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ +}; + struct pnfs_layout_segment { - struct list_head fi_list; - struct pnfs_layout_range range; - struct kref kref; - struct pnfs_layout_hdr *layout; + struct list_head pls_list; + struct pnfs_layout_range pls_range; + atomic_t pls_refcount; + unsigned long pls_flags; + struct pnfs_layout_hdr *pls_layout; }; #ifdef CONFIG_NFS_V4_1 @@ -44,7 +50,9 @@ struct pnfs_layout_segment { enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ - NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ + NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ }; /* Per-layout driver specific registration structure */ @@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type { }; struct pnfs_layout_hdr { - unsigned long refcount; - struct list_head layouts; /* other client layouts */ - struct list_head segs; /* layout segments list */ - seqlock_t seqlock; /* Protects the stateid */ - nfs4_stateid stateid; - unsigned long state; - struct inode *inode; + atomic_t plh_refcount; + struct list_head plh_layouts; /* other client layouts */ + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_segs; /* layout segments list */ + nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_flags; + struct inode *plh_inode; }; struct pnfs_device { @@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); /* pnfs.c */ +void get_layout_hdr(struct pnfs_layout_hdr *lo); struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 
enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); -void put_layout_hdr(struct inode *inode); -void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct nfs4_state *open_state); +void put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + bool update_barrier); +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); +int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + u32 iomode); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier); static inline int lo_fail_bit(u32 iomode) @@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, return NULL; } +static inline bool +pnfs_roc(struct inode *ino) +{ + return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool +pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + return false; +} + static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) { } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 58e7f84fc1f..77d5e21c4ad 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, fattr = nfs_alloc_fattr(); status = -ENOMEM; if (fh == NULL || fattr == NULL) - goto out; + goto out_free; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, if (status == 0) status = nfs_instantiate(dentry, fh, fattr); +out_free: nfs_free_fattr(fattr); nfs_free_fhandle(fh); out: @@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .statfs = nfs_proc_statfs, .fsinfo = nfs_proc_fsinfo, .pathconf = nfs_proc_pathconf, - .decode_dirent = nfs_decode_dirent, + .decode_dirent = nfs2_decode_dirent, .read_setup = nfs_proc_read_setup, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4100630c9a5..b68c8607770 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -598,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->mountd_version || showdefaults) seq_printf(m, ",mountvers=%u", nfss->mountd_version); - if (nfss->mountd_port || showdefaults) + if ((nfss->mountd_port && + nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) || + showdefaults) seq_printf(m, ",mountport=%u", nfss->mountd_port); nfs_show_mountd_netid(m, nfss, showdefaults); @@ -2200,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data) s->s_flags = sb_mntdata->mntflags; s->s_fs_info = server; + s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; ret = set_anon_super(s, server); if (ret == 0) server->s_dev = s->s_dev; @@ -2494,7 +2497,13 @@ static void nfs4_clone_super(struct super_block *sb, sb->s_maxbytes = old_sb->s_maxbytes; sb->s_time_gran = 1; sb->s_op = old_sb->s_op; 
- nfs_initialise_sb(sb); + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = old_sb->s_xattr; + nfs_initialise_sb(sb); } /* @@ -2504,6 +2513,12 @@ static void nfs4_fill_super(struct super_block *sb) { sb->s_time_gran = 1; sb->s_op = &nfs4_sops; + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = nfs4_xattr_handlers; nfs_initialise_sb(sb); } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 8fe9eb47a97..e313a51acdd 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); - task_setup_data.callback_data = data, + task_setup_data.callback_data = data; data->cred = rpc_lookup_cred(); if (IS_ERR(data->cred)) { diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h new file mode 100644 index 00000000000..34e5c40af5e --- /dev/null +++ b/fs/nfsd/acl.h @@ -0,0 +1,59 @@ +/* + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +#include <linux/posix_acl.h> + +/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to + * fit in a page: */ +#define NFS4_ACL_MAX 170 + +struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_get_whotype(char *, u32); +int nfs4_acl_write_who(int who, char *p); +int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, + uid_t who, u32 mask); + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, + struct posix_acl *, unsigned int flags); +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, + struct posix_acl **, unsigned int flags); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c0fcb7ab7f6..8b31e5f8795 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * NFS exporting and validation. * @@ -1444,9 +1443,6 @@ static struct flags { { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, -#ifdef MSNFS - { NFSEXP_MSNFS, {"msnfs", ""}}, -#endif { 0, {"", ""}} }; diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h new file mode 100644 index 00000000000..2f3be132153 --- /dev/null +++ b/fs/nfsd/idmap.h @@ -0,0 +1,62 @@ +/* + * Mapping of UID to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LINUX_NFSD_IDMAP_H +#define LINUX_NFSD_IDMAP_H + +#include <linux/in.h> +#include <linux/sunrpc/svc.h> + +/* XXX from linux/nfs_idmap.h */ +#define IDMAP_NAMESZ 128 + +#ifdef CONFIG_NFSD_V4 +int nfsd_idmap_init(void); +void nfsd_idmap_shutdown(void); +#else +static inline int nfsd_idmap_init(void) +{ + return 0; +} +static inline void nfsd_idmap_shutdown(void) +{ +} +#endif + +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); +int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); +int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); + +#endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 5b7e3021e06..2247fc91d5e 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, __be32 nfserr; u32 max_blocksize = svc_max_payload(rqstp); - dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", + dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, - (unsigned long) argp->offset); + (unsigned long long) argp->offset); /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) @@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, __be32 nfserr; unsigned long cnt = argp->len; - dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", + dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), argp->len, - (unsigned long) argp->offset, + (unsigned long long) argp->offset, argp->stable? " stable" : ""); fh_copy(&resp->fh, &argp->fh); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index e4805261515..ad88f1c0a4c 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,7 +36,7 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> -#include <linux/nfs4_acl.h> +#include "acl.h" /* mode bit translations: */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 143da2eecd7..3be975e1891 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -50,11 +50,6 @@ enum { NFSPROC4_CLNT_CB_SEQUENCE, }; -enum nfs_cb_opnum4 { - OP_CB_RECALL = 4, - OP_CB_SEQUENCE = 11, -}; - #define NFS4_MAXTAGLEN 20 #define NFS4_enc_cb_null_sz 0 @@ -79,61 +74,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) -/* -* Generic encode routines from fs/nfs/nfs4xdr.c -*/ -static inline __be32 * -xdr_writemem(__be32 *p, const void *ptr, int nbytes) -{ - int tmp = XDR_QUADLEN(nbytes); - if (!tmp) - return p; - p[tmp-1] = 0; - memcpy(p, ptr, nbytes); - return p + tmp; -} - -#define WRITE32(n) *p++ = htonl(n) -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_writemem(p, ptr, nbytes); \ -} while (0) -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \ - BUG_ON(!p); \ -} while (0) - -/* - * Generic decode routines from fs/nfs/nfs4xdr.c - */ -#define DECODE_TAIL \ - status = 0; \ -out: \ - return status; \ -xdr_error: \ - dprintk("NFSD: xdr error! 
(%s:%d)\n", __FILE__, __LINE__); \ - status = -EIO; \ - goto out - -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (!p) { \ - dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \ - __func__, __LINE__); \ - return -EIO; \ - } \ -} while (0) - struct nfs4_cb_compound_hdr { /* args */ u32 ident; /* minorversion 0 only */ @@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr { int status; }; -static struct { -int stat; -int errno; -} nfs_cb_errtbl[] = { - { NFS4_OK, 0 }, - { NFS4ERR_PERM, EPERM }, - { NFS4ERR_NOENT, ENOENT }, - { NFS4ERR_IO, EIO }, - { NFS4ERR_NXIO, ENXIO }, - { NFS4ERR_ACCESS, EACCES }, - { NFS4ERR_EXIST, EEXIST }, - { NFS4ERR_XDEV, EXDEV }, - { NFS4ERR_NOTDIR, ENOTDIR }, - { NFS4ERR_ISDIR, EISDIR }, - { NFS4ERR_INVAL, EINVAL }, - { NFS4ERR_FBIG, EFBIG }, - { NFS4ERR_NOSPC, ENOSPC }, - { NFS4ERR_ROFS, EROFS }, - { NFS4ERR_MLINK, EMLINK }, - { NFS4ERR_NAMETOOLONG, ENAMETOOLONG }, - { NFS4ERR_NOTEMPTY, ENOTEMPTY }, - { NFS4ERR_DQUOT, EDQUOT }, - { NFS4ERR_STALE, ESTALE }, - { NFS4ERR_BADHANDLE, EBADHANDLE }, - { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, - { NFS4ERR_NOTSUPP, ENOTSUPP }, - { NFS4ERR_TOOSMALL, ETOOSMALL }, - { NFS4ERR_SERVERFAULT, ESERVERFAULT }, - { NFS4ERR_BADTYPE, EBADTYPE }, - { NFS4ERR_LOCKED, EAGAIN }, - { NFS4ERR_RESOURCE, EREMOTEIO }, - { NFS4ERR_SYMLINK, ELOOP }, - { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, - { NFS4ERR_DEADLOCK, EDEADLK }, - { -1, EIO } -}; +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} -static int -nfs_cb_stat_to_errno(int stat) +static __be32 *xdr_encode_empty_array(__be32 *p) { - int i; - for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { - if (nfs_cb_errtbl[i].stat == stat) - return nfs_cb_errtbl[i].errno; - } - /* If we cannot translate the error, the recovery routines should - * handle it. - * Note: remaining NFSv4 error codes have values > 10000, so should - * not conflict with native Linux error codes. - */ - return stat; + *p++ = xdr_zero; + return p; } /* - * XDR encode + * Encode/decode NFSv4 CB basic data types + * + * Basic NFSv4 callback data types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section + * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version + * 1 Protocol" */ -static void -encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +/* + * nfs_cb_opnum4 + * + * enum nfs_cb_opnum4 { + * OP_CB_GETATTR = 3, + * ... 
+ * }; + */ +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044 +}; + +static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) { __be32 *p; - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(sid->si_generation); - WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(op); } -static void -encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) +/* + * nfs_fh4 + * + * typedef opaque nfs_fh4<NFS4_FHSIZE>; + */ +static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) { - __be32 * p; + u32 length = fh->fh_size; + __be32 *p; - RESERVE_SPACE(16); - WRITE32(0); /* tag length is always 0 */ - WRITE32(hdr->minorversion); - WRITE32(hdr->ident); - hdr->nops_p = p; - WRITE32(hdr->nops); + BUG_ON(length > NFS4_FHSIZE); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, &fh->fh_base, length); } -static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +/* + * stateid4 + * + * struct stateid4 { + * uint32_t seqid; + * opaque other[12]; + * }; + */ +static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) { - *hdr->nops_p = htonl(hdr->nops); + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(sid->si_generation); + xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE); } -static void -encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, - struct nfs4_cb_compound_hdr *hdr) +/* + * sessionid4 + * + * typedef opaque sessionid4[NFS4_SESSIONID_SIZE]; + */ +static void encode_sessionid4(struct xdr_stream *xdr, + const struct nfsd4_session *session) { __be32 *p; - int len = dp->dl_fh.fh_size; - - RESERVE_SPACE(4); - WRITE32(OP_CB_RECALL); - encode_stateid(xdr, &dp->dl_stateid); - RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); - WRITE32(0); /* truncate optimization not implemented */ - WRITE32(len); - WRITEMEM(&dp->dl_fh.fh_base, len); - hdr->nops++; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); } -static void -encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, - struct nfs4_cb_compound_hdr *hdr) -{ - __be32 *p; - struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; +/* + * nfsstat4 + */ +static const struct { + int stat; + int errno; +} nfs_cb_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -EIO }, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { 
NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; - if (hdr->minorversion == 0) - return; +/* + * If we cannot translate the error, the recovery routines should + * handle it. + * + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. + */ +static int nfs_cb_stat_to_errno(int status) +{ + int i; - RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); + for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { + if (nfs_cb_errtbl[i].stat == status) + return nfs_cb_errtbl[i].errno; + } - WRITE32(OP_CB_SEQUENCE); - WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(ses->se_cb_seq_nr); - WRITE32(0); /* slotid, always 0 */ - WRITE32(0); /* highest slotid always 0 */ - WRITE32(0); /* cachethis always 0 */ - WRITE32(0); /* FIXME: support referring_call_lists */ - hdr->nops++; + dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status); + return -status; } -static int -nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) +static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, + enum nfsstat4 *status) { - struct xdr_stream xdrs, *xdr = &xdrs; + __be32 *p; + u32 op; - xdr_init_encode(&xdrs, &req->rq_snd_buf, p); - RESERVE_SPACE(0); + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + op = be32_to_cpup(p++); + if (unlikely(op != expected)) + goto out_unexpected; + *status = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_unexpected: + dprintk("NFSD: Callback server returned operation %d but " + "we issued a request for %d\n", op, expected); + return -EIO; } -static int -nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, - struct nfsd4_callback *cb) +/* + * CB_COMPOUND4args + * + * struct CB_COMPOUND4args { + * utf8str_cs tag; + * uint32_t minorversion; + * uint32_t callback_ident; + * nfs_cb_argop4 argarray<>; + * }; +*/ +static void encode_cb_compound4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) { - struct xdr_stream xdr; - struct nfs4_delegation *args = cb->cb_op; - struct nfs4_cb_compound_hdr hdr = { - .ident = cb->cb_clp->cl_cb_ident, - .minorversion = cb->cb_minorversion, - }; + __be32 * p; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_cb_compound_hdr(&xdr, &hdr); - encode_cb_sequence(&xdr, cb, &hdr); - encode_cb_recall(&xdr, args, &hdr); - encode_cb_nops(&hdr); + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + p = xdr_encode_empty_array(p); /* empty tag */ + *p++ = cpu_to_be32(hdr->minorversion); + *p++ = cpu_to_be32(hdr->ident); + + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); /* argarray element count */ +} + +/* + * Update argarray element count + */ +static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS); + *hdr->nops_p = cpu_to_be32(hdr->nops); +} + +/* + * CB_COMPOUND4res + * + * struct CB_COMPOUND4res { + * nfsstat4 status; + * utf8str_cs tag; + * nfs_cb_resop4 resarray<>; + * }; + */ +static int decode_cb_compound4res(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + /* Ignore the tag */ + length = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, length + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->nops = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, 
xdr); + return -EIO; } +/* + * CB_RECALL4args + * + * struct CB_RECALL4args { + * stateid4 stateid; + * bool truncate; + * nfs_fh4 fh; + * }; + */ +static void encode_cb_recall4args(struct xdr_stream *xdr, + const struct nfs4_delegation *dp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); + encode_stateid4(xdr, &dp->dl_stateid); + + p = xdr_reserve_space(xdr, 4); + *p++ = xdr_zero; /* truncate */ -static int -decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ - __be32 *p; - u32 taglen; + encode_nfs_fh4(xdr, &dp->dl_fh); - READ_BUF(8); - READ32(hdr->status); - /* We've got no use for the tag; ignore it: */ - READ32(taglen); - READ_BUF(taglen + 4); - p += XDR_QUADLEN(taglen); - READ32(hdr->nops); - return 0; + hdr->nops++; } -static int -decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +/* + * CB_SEQUENCE4args + * + * struct CB_SEQUENCE4args { + * sessionid4 csa_sessionid; + * sequenceid4 csa_sequenceid; + * slotid4 csa_slotid; + * slotid4 csa_highest_slotid; + * bool csa_cachethis; + * referring_call_list4 csa_referring_call_lists<>; + * }; + */ +static void encode_cb_sequence4args(struct xdr_stream *xdr, + const struct nfsd4_callback *cb, + struct nfs4_cb_compound_hdr *hdr) { + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; __be32 *p; - u32 op; - int32_t nfserr; - - READ_BUF(8); - READ32(op); - if (op != expected) { - dprintk("NFSD: decode_cb_op_hdr: Callback server returned " - " operation %d but we issued a request for %d\n", - op, expected); - return -EIO; - } - READ32(nfserr); - if (nfserr != NFS_OK) - return -nfs_cb_stat_to_errno(nfserr); - return 0; + + if (hdr->minorversion == 0) + return; + + encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); + encode_sessionid4(xdr, session); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ + *p++ = xdr_zero; /* csa_slotid */ + *p++ = xdr_zero; /* csa_highest_slotid */ + *p++ = xdr_zero; /* csa_cachethis */ + xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + hdr->nops++; } /* + * CB_SEQUENCE4resok + * + * struct CB_SEQUENCE4resok { + * sessionid4 csr_sessionid; + * sequenceid4 csr_sequenceid; + * slotid4 csr_slotid; + * slotid4 csr_highest_slotid; + * slotid4 csr_target_highest_slotid; + * }; + * + * union CB_SEQUENCE4res switch (nfsstat4 csr_status) { + * case NFS4_OK: + * CB_SEQUENCE4resok csr_resok4; + * default: + * void; + * }; + * * Our current back channel implementation supports a single backchannel * with a single slot. */ -static int -decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, - struct rpc_rqst *rqstp) +static int decode_cb_sequence4resok(struct xdr_stream *xdr, + struct nfsd4_callback *cb) { - struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; struct nfs4_sessionid id; int status; - u32 dummy; __be32 *p; + u32 dummy; - if (cb->cb_minorversion == 0) - return 0; - - status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); - if (status) - return status; + status = -ESERVERFAULT; /* * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes.
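 *
 * With a single slot, at most one callback is in flight per session;
 * csa_slotid and csa_highest_slotid were encoded as 0 above, so a
 * conforming reply must satisfy, roughly:
 *
 *	memcmp(csr_sessionid, se_sessionid, NFS4_MAX_SESSIONID_LEN) == 0
 *	csr_sequenceid == se_cb_seq_nr && csr_slotid == 0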
*/ - status = -ESERVERFAULT; - - READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); - p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); - if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { - dprintk("%s Invalid session id\n", __func__); + if (memcmp(id.data, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) { + dprintk("NFS: %s Invalid session id\n", __func__); goto out; } - READ32(dummy); - if (dummy != ses->se_cb_seq_nr) { - dprintk("%s Invalid sequence number\n", __func__); + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + + dummy = be32_to_cpup(p++); + if (dummy != session->se_cb_seq_nr) { + dprintk("NFS: %s Invalid sequence number\n", __func__); goto out; } - READ32(dummy); /* slotid must be 0 */ + + dummy = be32_to_cpup(p++); if (dummy != 0) { - dprintk("%s Invalid slotid\n", __func__); + dprintk("NFS: %s Invalid slotid\n", __func__); goto out; } - /* FIXME: process highest slotid and target highest slotid */ + + /* + * FIXME: process highest slotid and target highest slotid + */ status = 0; out: return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } +static int decode_cb_sequence4res(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + enum nfsstat4 nfserr; + int status; + + if (cb->cb_minorversion == 0) + return 0; -static int -nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + goto out_default; + status = decode_cb_sequence4resok(xdr, cb); +out: + return status; +out_default: + return nfs_cb_stat_to_errno(nfserr); +} + +/* + * NFSv4.0 and NFSv4.1 XDR encode functions + * + * NFSv4.0 callback argument types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +/* + * NB: Without this zero space reservation, callbacks over krb5p fail + */ +static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + xdr_reserve_space(xdr, 0); +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_delegation *args = cb->cb_op; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recall4args(xdr, args, &hdr); + encode_cb_nops(&hdr); +} + + +/* + * NFSv4.0 and NFSv4.1 XDR decode functions + * + * NFSv4.0 callback result types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) { return 0; } -static int -nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, - struct nfsd4_callback *cb) +/* + * 20.2.
Operation 4: CB_RECALL - Recall a Delegation + */ +static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) { - struct xdr_stream xdr; struct nfs4_cb_compound_hdr hdr; + enum nfsstat4 nfserr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_cb_compound_hdr(&xdr, &hdr); - if (status) + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) goto out; - if (cb) { - status = decode_cb_sequence(&xdr, cb, rqstp); - if (status) + + if (cb != NULL) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status)) goto out; } - status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); + + status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + goto out_default; out: return status; +out_default: + return nfs_cb_stat_to_errno(nfserr); } /* * RPC procedure tables */ -#define PROC(proc, call, argtype, restype) \ -[NFSPROC4_CLNT_##proc] = { \ - .p_proc = NFSPROC4_CB_##call, \ - .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ - .p_arglen = NFS4_##argtype##_sz, \ - .p_replen = NFS4_##restype##_sz, \ - .p_statidx = NFSPROC4_CB_##call, \ - .p_name = #proc, \ -} - -static struct rpc_procinfo nfs4_cb_procedures[] = { - PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), - PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), +#define PROC(proc, call, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_CB_##call, \ + .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \ + .p_arglen = NFS4_enc_##argtype##_sz, \ + .p_replen = NFS4_dec_##restype##_sz, \ + .p_statidx = NFSPROC4_CB_##call, \ + .p_name = #proc, \ +} + +static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), }; -static struct rpc_version nfs_cb_version4 = { +static struct rpc_version nfs_cb_version4 = { /* * Note on the callback rpc program version number: despite language in rfc * 5661 section 18.36.3 requiring servers to use 4 in this field, the @@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = { * in practice that appears to be what implementations use. The section * 18.36.3 language is expected to be fixed in an erratum. */ - .number = 1, - .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), - .procs = nfs4_cb_procedures + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures }; -static struct rpc_version * nfs_cb_version[] = { +static struct rpc_version *nfs_cb_version[] = { &nfs_cb_version4, }; static struct rpc_program cb_program; static struct rpc_stat cb_stats = { - .program = &cb_program + .program = &cb_program }; #define NFS4_CALLBACK 0x40000000 static struct rpc_program cb_program = { - .name = "nfs4_cb", - .number = NFS4_CALLBACK, - .nrvers = ARRAY_SIZE(nfs_cb_version), - .version = nfs_cb_version, - .stats = &cb_stats, - .pipe_dir_name = "/nfsd4_cb", + .name = "nfs4_cb", + .number = NFS4_CALLBACK, + .nrvers = ARRAY_SIZE(nfs_cb_version), + .version = nfs_cb_version, + .stats = &cb_stats, + .pipe_dir_name = "/nfsd4_cb", }; static int max_cb_time(void) @@ -470,10 +628,8 @@ static int max_cb_time(void) return max(nfsd4_lease/10, (time_t)1) * HZ; } -/* Reference counting, callback cleanup, etc., all look racy as heck. * And why is cl_cb_set an atomic?
*/ -int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { struct rpc_timeout timeparms = { .to_initval = max_cb_time(), @@ -483,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) .net = &init_net, .address = (struct sockaddr *) &conn->cb_addr, .addrsize = conn->cb_addrlen, + .saddress = (struct sockaddr *) &conn->cb_saddr, .timeout = &timeparms, .program = &cb_program, .version = 0, @@ -499,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) args.protocol = XPRT_TRANSPORT_TCP; clp->cl_cb_ident = conn->cb_ident; } else { + if (!conn->cb_xprt) + return -EINVAL; + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; + clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; args.protocol = XPRT_TRANSPORT_BC_TCP; @@ -521,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) (int)clp->cl_name.len, clp->cl_name.data, reason); } +static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_DOWN; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - warn_no_callback_path(clp, task->tk_status); + nfsd4_mark_cb_down(clp, task->tk_status); else - atomic_set(&clp->cl_cb_set, 1); + clp->cl_cb_state = NFSD4_CB_UP; } static const struct rpc_call_ops nfsd4_cb_probe_ops = { @@ -551,6 +718,11 @@ int set_callback_cred(void) static struct workqueue_struct *callback_wq; +static void run_nfsd4_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); +} + static void do_probe_callback(struct nfs4_client *clp) { struct nfsd4_callback *cb = &clp->cl_cb_null; @@ -565,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp) cb->cb_ops = &nfsd4_cb_probe_ops; - queue_work(callback_wq, &cb->cb_work); + run_nfsd4_cb(cb); } /* @@ -574,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp) */ void nfsd4_probe_callback(struct nfs4_client *clp) { + /* XXX: atomicity? Also, should we be using cl_cb_flags? */ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); do_probe_callback(clp); } -void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +void nfsd4_probe_callback_sync(struct nfs4_client *clp) { - BUG_ON(atomic_read(&clp->cl_cb_set)); + nfsd4_probe_callback(clp); + flush_workqueue(callback_wq); +} +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; spin_lock(&clp->cl_lock); memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); spin_unlock(&clp->cl_lock); @@ -592,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) * If the slot is available, then mark it busy. Otherwise, set the * thread for sleeping on the callback RPC wait queue. 
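 *
 * The slot is modelled as bit 0 of cl_cb_slot_busy:
 * nfsd41_cb_get_slot() claims it with test_and_set_bit(), and a task
 * that loses the race parks itself on cl_cb_waitq via rpc_sleep_on()
 * until the current holder releases the slot and wakes the queue.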
*/ -static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, - struct rpc_task *task) +static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) { - u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data; - int status = 0; - - dprintk("%s: %u:%u:%u:%u\n", __func__, - ptr[0], ptr[1], ptr[2], ptr[3]); - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); dprintk("%s slot is busy\n", __func__); - status = -EAGAIN; - goto out; + return false; } -out: - dprintk("%s status=%d\n", __func__, status); - return status; + return true; } /* @@ -622,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); struct nfs4_client *clp = dp->dl_client; u32 minorversion = clp->cl_minorversion; - int status = 0; cb->cb_minorversion = minorversion; if (minorversion) { - status = nfsd41_cb_setup_sequence(clp, task); - if (status) { - if (status != -EAGAIN) { - /* terminate rpc task */ - task->tk_status = status; - task->tk_action = NULL; - } + if (!nfsd41_cb_get_slot(clp, task)) return; - } } + spin_lock(&clp->cl_lock); + if (list_empty(&cb->cb_per_client)) { + /* This is the first call, not a restart */ + cb->cb_done = false; + list_add(&cb->cb_per_client, &clp->cl_callbacks); + } + spin_unlock(&clp->cl_lock); rpc_call_start(task); } @@ -671,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) nfsd4_cb_done(task, calldata); - if (current_rpc_client == NULL) { - /* We're shutting down; give up. */ - /* XXX: err, or is it ok just to fall through - * and rpc_restart_call? */ + if (current_rpc_client != task->tk_client) { + /* We're shutting down or changing cl_cb_client; leave + * it to nfsd4_process_cb_update to restart the call if + * necessary. */ return; } + if (cb->cb_done) + return; switch (task->tk_status) { case 0: + cb->cb_done = true; return; case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: @@ -688,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) break; default: /* Network partition? 
*/ - atomic_set(&clp->cl_cb_set, 0); - warn_no_callback_path(clp, task->tk_status); - if (current_rpc_client != task->tk_client) { - /* queue a callback on the new connection: */ - atomic_inc(&dp->dl_count); - nfsd4_cb_recall(dp); - return; - } + nfsd4_mark_cb_down(clp, task->tk_status); } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); task->tk_status = 0; rpc_restart_call_prepare(task); return; - } else { - atomic_set(&clp->cl_cb_set, 0); - warn_no_callback_path(clp, task->tk_status); } + nfsd4_mark_cb_down(clp, task->tk_status); + cb->cb_done = true; } static void nfsd4_cb_recall_release(void *calldata) { struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - nfs4_put_delegation(dp); + if (cb->cb_done) { + spin_lock(&clp->cl_lock); + list_del(&cb->cb_per_client); + spin_unlock(&clp->cl_lock); + nfs4_put_delegation(dp); + } } static const struct rpc_call_ops nfsd4_cb_recall_ops = { @@ -748,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) flush_workqueue(callback_wq); } -void nfsd4_release_cb(struct nfsd4_callback *cb) +static void nfsd4_release_cb(struct nfsd4_callback *cb) { if (cb->cb_ops->rpc_release) cb->cb_ops->rpc_release(cb); } -void nfsd4_process_cb_update(struct nfsd4_callback *cb) +/* requires cl_lock: */ +static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) +{ + struct nfsd4_session *s; + struct nfsd4_conn *c; + + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_flags & NFS4_CDFC4_BACK) + return c; + } + } + return NULL; +} + +static void nfsd4_process_cb_update(struct nfsd4_callback *cb) { struct nfs4_cb_conn conn; struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = NULL; + struct nfsd4_conn *c; int err; /* @@ -768,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb) rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; } + if (clp->cl_cb_conn.cb_xprt) { + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + clp->cl_cb_conn.cb_xprt = NULL; + } if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) return; spin_lock(&clp->cl_lock); @@ -778,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb) BUG_ON(!clp->cl_cb_flags); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + c = __nfsd4_find_backchannel(clp); + if (c) { + svc_xprt_get(c->cn_xprt); + conn.cb_xprt = c->cn_xprt; + ses = c->cn_session; + } spin_unlock(&clp->cl_lock); - err = setup_callback_client(clp, &conn); - if (err) + err = setup_callback_client(clp, &conn, ses); + if (err) { warn_no_callback_path(clp, err); + return; + } + /* Yay, the callback channel's back! 
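Each not-yet-finished callback was parked on cl_callbacks by nfsd4_cb_prepare and stays there until released.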
Restart any callbacks: */ + list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) + run_nfsd4_cb(cb); } void nfsd4_do_callback_rpc(struct work_struct *w) @@ -807,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; + struct nfs4_client *clp = dp->dl_client; dp->dl_retries = 1; cb->cb_op = dp; - cb->cb_clp = dp->dl_client; + cb->cb_clp = clp; cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; @@ -819,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp) cb->cb_ops = &nfsd4_cb_recall_ops; dp->dl_retries = 1; - queue_work(callback_wq, &dp->dl_recall.cb_work); + INIT_LIST_HEAD(&cb->cb_per_client); + cb->cb_done = true; + + run_nfsd4_cb(&dp->dl_recall); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index f0695e815f0..6d2c397d458 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -33,10 +33,11 @@ */ #include <linux/module.h> -#include <linux/nfsd_idmap.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> +#include "idmap.h" +#include "nfsd.h" /* * Cache entry @@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp) return clp->name; } -static int +static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) { @@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen int ret; if (namelen + 1 > sizeof(key.name)) - return -EINVAL; + return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); if (ret == -ENOENT) - ret = -ESRCH; /* nfserr_badname */ + return nfserr_badowner; if (ret) - return ret; + return nfserrno(ret); *id = item->id; cache_put(&item->h, &nametoid_cache); return 0; @@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) return ret; } -int +__be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); } -int +__be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0cdfd022bb7..db52546143d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } -static __be32 -nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - void *arg) +static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) { struct svc_fh tmp_fh; __be32 ret; @@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = exp_pseudoroot(rqstp, &tmp_fh); if (ret) return ret; - if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { + if (tmp_fh.fh_dentry == fh->fh_dentry) { fh_put(&tmp_fh); return nfserr_noent; } fh_put(&tmp_fh); - return nfsd_lookup(rqstp, &cstate->current_fh, - "..", 2, &cstate->current_fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh); +} + +static __be32 +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + return nfsd4_do_lookupp(rqstp, &cstate->current_fh); } static __be32 @@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate, } else secinfo->si_exp = exp; dput(dentry); + if (cstate->minorversion) + /* See rfc 5661 section 2.6.3.1.1.8 */ + fh_put(&cstate->current_fh); return err; } static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo_no_name *sin) +{ + __be32 err; + + switch (sin->sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + exp_get(cstate->current_fh.fh_export); + sin->sin_exp = cstate->current_fh.fh_export; + fh_put(&cstate->current_fh); + return nfs_ok; +} + +static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) { @@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum); * Also note, enforced elsewhere: * - SEQUENCE other than as first op results in * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) - * - BIND_CONN_TO_SESSION must be the only op in its compound - * (Will be enforced in nfsd4_bind_conn_to_session().) + * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) * - DESTROY_SESSION must be the final operation in a compound, if * sessionid's in SEQUENCE and DESTROY_SESSION are the same. * (Enforced in nfsd4_destroy_session().) @@ -1126,10 +1156,6 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - if (!rqstp->rq_usedeferral && status == nfserr_dropit) { - dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__); - status = nfserr_jukebox; - } resp->cstate.status = status; fh_put(&resp->cstate.current_fh); @@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_EXCHANGE_ID", }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_BIND_CONN_TO_SESSION", + }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, @@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_RECLAIM_COMPLETE", }, + [OP_SECINFO_NO_NAME] = { + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_name = "OP_SECINFO_NO_NAME", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 7e26caab2a2..ffb59ef6f82 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child) { int status; - /* note: we currently use this path only for minorversion 0 */ if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fbd18c3074b..d98d0213285 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; - nfs4_file_get_access(fp, O_RDONLY); + dp->dl_vfs_file = find_readable_file(fp); + get_file(dp->dl_vfs_file); dp->dl_flock = NULL; dp->dl_type = type; dp->dl_stateid.si_boot = boot_time; @@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); + 
fput(dp->dl_vfs_file); kmem_cache_free(deleg_slab, dp); num_delegations--; } @@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp) static void nfs4_close_delegation(struct nfs4_delegation *dp) { - struct file *filp = find_readable_file(dp->dl_file); - dprintk("NFSD: close_delegation dp %p\n",dp); + /* XXX: do we even need this check?: */ if (dp->dl_flock) - vfs_setlease(filp, F_UNLCK, &dp->dl_flock); - nfs4_file_put_access(dp->dl_file, O_RDONLY); + vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); } /* Called under the state lock. */ @@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) free_conn(c); } spin_unlock(&clp->cl_lock); + nfsd4_probe_callback(clp); } static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) @@ -679,15 +680,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn) return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } -static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) +static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) { struct nfsd4_conn *conn; - u32 flags = NFS4_CDFC4_FORE; int ret; - if (ses->se_flags & SESSION4_BACK_CHAN) - flags |= NFS4_CDFC4_BACK; - conn = alloc_conn(rqstp, flags); + conn = alloc_conn(rqstp, dir); if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); @@ -698,6 +696,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) return nfs_ok; } +static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (ses->se_flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + + return nfsd4_new_conn(rqstp, ses, dir); +} + +/* must be called under client_lock */ static void nfsd4_del_conns(struct nfsd4_session *s) { struct nfs4_client *clp = s->se_client; @@ -749,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n */ slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); + if (numslots < 1) + return NULL; new = alloc_session(slotsize, numslots); if (!new) { @@ -769,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n idx = hash_sessionid(&new->se_sessionid); spin_lock(&client_lock); list_add(&new->se_hash, &sessionid_hashtbl[idx]); + spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&clp->cl_lock); spin_unlock(&client_lock); - status = nfsd4_new_conn(rqstp, new); + status = nfsd4_new_conn_from_crses(rqstp, new); /* whoops: benny points out, status is ignored! (err, or bogus) */ if (status) { free_session(&new->se_ref); return NULL; } - if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { + if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); - - clp->cl_cb_session = new; - clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; - svc_xprt_get(rqstp->rq_xprt); + /* + * This is a little silly; with sessions there's no real + * use for the callback address. 
Use the peer address + * as a reasonable default for now, but consider fixing + * the rpc client not to require an address in the + * future: + */ rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); - nfsd4_probe_callback(clp); } + nfsd4_probe_callback(clp); return new; } @@ -817,7 +833,9 @@ static void unhash_session(struct nfsd4_session *ses) { list_del(&ses->se_hash); + spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); + spin_unlock(&ses->se_client->cl_lock); } /* must be called under the client_lock */ @@ -923,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp) mark_client_expired(clp); list_del(&clp->cl_lru); + spin_lock(&clp->cl_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) list_del_init(&ses->se_hash); + spin_unlock(&clp->cl_lock); } static void @@ -1051,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); - atomic_set(&clp->cl_cb_set, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); spin_lock_init(&clp->cl_lock); INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); clp->cl_time = get_seconds(); @@ -1132,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid) return NULL; } -/* - * Return 1 iff clp's clientid establishment method matches the use_exchange_id - * parameter. Matching is based on the fact the at least one of the - * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1 - * - * FIXME: we need to unify the clientid namespaces for nfsv4.x - * and correctly deal with client upgrade/downgrade in EXCHANGE_ID - * and SET_CLIENTID{,_CONFIRM} - */ -static inline int -match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id) +static bool clp_used_exchangeid(struct nfs4_client *clp) { - bool has_exchange_flags = (clp->cl_exchange_flags != 0); - return use_exchange_id == has_exchange_flags; -} + return clp->cl_exchange_flags != 0; +} static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval, - bool use_exchange_id) +find_confirmed_client_by_str(const char *dname, unsigned int hashval) { struct nfs4_client *clp; list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname) && - match_clientid_establishment(clp, use_exchange_id)) + if (same_name(clp->cl_recdir, dname)) return clp; } return NULL; } static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, - bool use_exchange_id) +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) { struct nfs4_client *clp; list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname) && - match_clientid_establishment(clp, use_exchange_id)) + if (same_name(clp->cl_recdir, dname)) return clp; } return NULL; } +static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) +{ + switch (family) { + case AF_INET: + ((struct sockaddr_in *)sa)->sin_family = AF_INET; + ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; + return; + case AF_INET6: + ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; + return; + 
} +} + static void -gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { struct nfs4_cb_conn *conn = &clp->cl_cb_conn; + struct sockaddr *sa = svc_addr(rqstp); + u32 scopeid = rpc_get_scope_id(sa); unsigned short expected_family; /* Currently, we only support tcp and tcp6 for the callback channel */ @@ -1205,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; + rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; @@ -1344,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, case SP4_NONE: break; case SP4_SSV: - return nfserr_encr_alg_unsupp; + return nfserr_serverfault; default: BUG(); /* checked by xdr code */ case SP4_MACH_CRED: @@ -1361,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfs_ok; - conf = find_confirmed_client_by_str(dname, strhashval, true); + conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { + if (!clp_used_exchangeid(conf)) { + status = nfserr_clid_inuse; /* XXX: ? */ + goto out; + } if (!same_verf(&verf, &conf->cl_verifier)) { /* 18.35.4 case 8 */ if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { @@ -1403,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, goto out; } - unconf = find_unconfirmed_client_by_str(dname, strhashval, true); + unconf = find_unconfirmed_client_by_str(dname, strhashval); if (unconf) { /* * Possible retry or client restart. Per 18.35.4 case 4, @@ -1560,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfs_ok; memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); + memcpy(&cr_ses->fore_channel, &new->se_fchannel, + sizeof(struct nfsd4_channel_attrs)); cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; @@ -1581,6 +1610,46 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + } + return nfserr_inval; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 status; + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&client_lock); + cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); + /* Sorta weird: we only need the refcnt'ing because new_conn acquires + * client_lock itself: */ + if (cstate->session) { + nfsd4_get_session(cstate->session); + atomic_inc(&cstate->session->se_client->cl_refcount); + } + spin_unlock(&client_lock); + if (!cstate->session) + return nfserr_badsession; + + status = nfsd4_map_bcts_dir(&bcts->dir); + if (!status) + status = nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return status; +} + static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) { if (!session) @@ -1619,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r, spin_unlock(&client_lock); nfs4_lock_state(); - /* wait for callbacks */ - nfsd4_shutdown_callback(ses->se_client); + nfsd4_probe_callback_sync(ses->se_client); nfs4_unlock_state(); nfsd4_del_conns(ses); @@ -1733,8 +1800,12 @@
nfsd4_sequence(struct svc_rqst *rqstp, out: /* Hold a session reference until done processing the compound. */ if (cstate->session) { + struct nfs4_client *clp = session->se_client; + nfsd4_get_session(cstate->session); - atomic_inc(&session->se_client->cl_refcount); + atomic_inc(&clp->cl_refcount); + if (clp->cl_cb_state == NFSD4_CB_DOWN) + seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; } kfree(conn); spin_unlock(&client_lock); @@ -1775,7 +1846,6 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct sockaddr *sa = svc_addr(rqstp); struct xdr_netobj clname = { .len = setclid->se_namelen, .data = setclid->se_name, @@ -1801,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, strhashval = clientstr_hashval(dname); nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval, false); + conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; + if (clp_used_exchangeid(conf)) + goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { char addr_str[INET6_ADDRSTRLEN]; rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, @@ -1819,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * has a description of SETCLIENTID request processing consisting * of 5 bullet points, labeled as CASE0 - CASE4 below. */ - unconf = find_unconfirmed_client_by_str(dname, strhashval, false); + unconf = find_unconfirmed_client_by_str(dname, strhashval); status = nfserr_resource; if (!conf) { /* @@ -1876,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * for consistent minorversion use throughout: */ new->cl_minorversion = 0; - gen_callback(new, setclid, rpc_get_scope_id(sa)); + gen_callback(new, setclid, rqstp); add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; @@ -1935,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { - atomic_set(&conf->cl_cb_set, 0); nfsd4_change_callback(conf, &unconf->cl_cb_conn); nfsd4_probe_callback(conf); expire_client(unconf); @@ -1964,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unsigned int hash = clientstr_hashval(unconf->cl_recdir); conf = find_confirmed_client_by_str(unconf->cl_recdir, - hash, false); + hash); if (conf) { nfsd4_remove_clid_dir(conf); expire_client(conf); @@ -2300,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl) nfsd4_cb_recall(dp); } -/* - * The file_lock is being reapd. - * - * Called by locks_free_lock() with lock_flocks() held. 
- */ -static -void nfsd_release_deleg_cb(struct file_lock *fl) -{ - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; - - dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count)); - - if (!(fl->fl_flags & FL_LEASE) || !dp) - return; - dp->dl_flock = NULL; -} - -/* - * Called from setlease() with lock_flocks() held - */ -static -int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) -{ - struct nfs4_delegation *onlistd = - (struct nfs4_delegation *)onlist->fl_owner; - struct nfs4_delegation *tryd = - (struct nfs4_delegation *)try->fl_owner; - - if (onlist->fl_lmops != try->fl_lmops) - return 0; - - return onlistd->dl_client == tryd->dl_client; -} - - static int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) { @@ -2346,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, - .fl_release_private = nfsd_release_deleg_cb, - .fl_mylease = nfsd_same_client_deleg_cb, .fl_change = nfsd_change_deleg_cb, }; @@ -2514,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &fp->fi_fds[oflag]); - if (status == nfserr_dropit) - status = nfserr_jukebox; if (status) return status; } @@ -2596,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open) open->op_stateowner->so_client->cl_firststate = 1; } +/* Should we give out recallable state?: */ +static bool nfsd4_cb_channel_good(struct nfs4_client *clp) +{ + if (clp->cl_cb_state == NFSD4_CB_UP) + return true; + /* + * In the sessions case, since we don't have to establish a + * separate connection for callbacks, we assume it's OK + * until we hear otherwise: + */ + return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; +} + /* * Attempt to hand out a delegation. */ @@ -2604,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta { struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; - int cb_up = atomic_read(&sop->so_client->cl_cb_set); + int cb_up; struct file_lock *fl; int status, flag = 0; + cb_up = nfsd4_cb_channel_good(sop->so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2655,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dp->dl_flock = fl; /* vfs_setlease checks to see if delegation should be handed out. 
- * the lock_manager callbacks fl_mylease and fl_change are used + * the lock_manager callback fl_change is used */ if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); @@ -2794,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) - && !atomic_read(&clp->cl_cb_set)) + && clp->cl_cb_state != NFSD4_CB_UP) goto out; status = nfs_ok; out: @@ -3081,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (status) goto out; renew_client(dp->dl_client); - if (filpp) + if (filpp) { *filpp = find_readable_file(dp->dl_file); - BUG_ON(!*filpp); + BUG_ON(!*filpp); + } } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); if (!stp) @@ -4107,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) unsigned int strhashval = clientstr_hashval(name); struct nfs4_client *clp; - clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); + clp = find_confirmed_client_by_str(name, strhashval); return clp ? 1 : 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f35a94a0402..956629b9cdc 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -44,13 +44,14 @@ #include <linux/namei.h> #include <linux/statfs.h> #include <linux/utsname.h> -#include <linux/nfsd_idmap.h> -#include <linux/nfs4_acl.h> #include <linux/sunrpc/svcauth_gss.h> +#include "idmap.h" +#include "acl.h" #include "xdr4.h" #include "vfs.h" + #define NFSDDBG_FACILITY NFSDDBG_XDR /* @@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); ace->whotype = nfs4_acl_get_whotype(buf, dummy32); - host_err = 0; + status = nfs_ok; if (ace->whotype != NFS4_ACL_WHO_NAMED) ace->who = 0; else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - host_err = nfsd_map_name_to_gid(argp->rqstp, + status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &ace->who); else - host_err = nfsd_map_name_to_uid(argp->rqstp, + status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &ace->who); - if (host_err) - goto out_nfserr; + if (status) + return status; } } else *acl = NULL; @@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_TAIL; } +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + u32 dummy; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(bcts->dir); + /* XXX: Perhaps Tom Tucker could help us figure out how we + * should be using ctsa_use_conn_in_rdma_mode: */ + READ32(dummy); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { @@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, } static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(sin->sin_style); + DECODE_TAIL; +} + +static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { __be32 status; @@ -1005,7 +1032,7 @@ static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, struct nfsd4_exchange_id *exid) { - int dummy; + int dummy, tmp; DECODE_HEAD; READ_BUF(NFS4_VERIFIER_SIZE); @@ -1053,15 +1080,23 @@ 
nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, /* ssp_hash_algs<> */ READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } /* ssp_encr_algs<> */ READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } /* ssp_window and ssp_num_gss_handles */ READ_BUF(8); @@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { /* new operations for NFSv4.1 */ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, @@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, case nfserr_resource: nfserr = nfserr_toosmall; goto fail; - case nfserr_dropit: - goto fail; case nfserr_noent: goto skip_entry; default: @@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return nfserr; } +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); + WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(bcts->dir); + /* XXX: ? */ + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { @@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) +nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, + __be32 nfserr, struct svc_export *exp) { int i = 0; - struct svc_export *exp = secinfo->si_exp; u32 nflavs; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; @@ -2892,6 +2939,20 @@ out: return nfserr; } +static __be32 +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); +} + +static __be32 +nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo_no_name *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); +} + /* * The SETATTR encode routine is special -- it always encodes a bitmap, * regardless of the error status.
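 * (SETATTR4res in RFC 3530 is a struct -- { nfsstat4 status;
 * bitmap4 attrsset; } -- not a union, so attrsset is present in the
 * reply even when status reports an error.)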
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITE32(seq->seqid); WRITE32(seq->slotid); WRITE32(seq->maxslots); - /* - * FIXME: for now: - * target_maxslots = maxslots - * status_flags = 0 - */ + /* For now: target_maxslots = maxslots */ WRITE32(seq->maxslots); - WRITE32(0); + WRITE32(seq->status_flags); ADJUST_ARGS(); resp->cstate.datap = p; /* DRC cache data pointer */ @@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* NFSv4.1 operations */ [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, @@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4514ebbee4d..33b3e2b0677 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -8,12 +8,12 @@ #include <linux/namei.h> #include <linux/ctype.h> -#include <linux/nfsd_idmap.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> +#include "idmap.h" #include "nfsd.h" #include "cache.h" @@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { +#ifdef CONFIG_NFSD_DEPRECATED static int warned; if (file->f_dentry->d_name.name[0] == '.' && !warned) { printk(KERN_INFO @@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size current->comm, file->f_dentry->d_name.name); warned = 1; } +#endif if (! 
file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 6b641cf2c19..7ecfa242030 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) #define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) #define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER) #define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) #define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) #define nfserr_grace cpu_to_be32(NFSERR_GRACE) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 08e17264784..e15dc45fc5e 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -735,9 +735,9 @@ nfserrno (int errno) { nfserr_stale, -ESTALE }, { nfserr_jukebox, -ETIMEDOUT }, { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_dropit, -EAGAIN }, - { nfserr_dropit, -ENOMEM }, - { nfserr_badname, -ESRCH }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, { nfserr_io, -ETXTBSY }, { nfserr_notsupp, -EOPNOTSUPP }, { nfserr_toosmall, -ETOOSMALL }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2bae1d86f5f..18743c4d8bc 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit) { + if (nfserr == nfserr_dropit || rqstp->rq_dropme) { dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 39adc27b068..3074656ba7b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,10 +68,12 @@ typedef struct { struct nfsd4_callback { void *cb_op; struct nfs4_client *cb_clp; + struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; const struct rpc_call_ops *cb_ops; struct work_struct cb_work; + bool cb_done; }; struct nfs4_delegation { @@ -81,6 +83,7 @@ struct nfs4_delegation { atomic_t dl_count; /* ref count */ struct nfs4_client *dl_client; struct nfs4_file *dl_file; + struct file *dl_vfs_file; struct file_lock *dl_flock; u32 dl_type; time_t dl_time; @@ -95,6 +98,7 @@ struct nfs4_delegation { struct nfs4_cb_conn { /* SETCLIENTID info */ struct sockaddr_storage cb_addr; + struct sockaddr_storage cb_saddr; size_t cb_addrlen; u32 cb_prog; /* used only in 4.0 case; per-session otherwise */ @@ -146,6 +150,11 @@ struct nfsd4_create_session { u32 gid; }; +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + /* The single slot clientid cache structure */ struct nfsd4_clid_slot { u32 sl_seqid; @@ -235,9 +244,13 @@ struct nfs4_client { unsigned long cl_cb_flags; struct rpc_clnt *cl_cb_client; u32 cl_cb_ident; - atomic_t cl_cb_set; +#define NFSD4_CB_UP 0 +#define NFSD4_CB_UNKNOWN 1 +#define NFSD4_CB_DOWN 2 + int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; + struct list_head cl_callbacks; /* list of in-progress callbacks */ /* for all client information that callback code might need: */ spinlock_t cl_lock; @@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid); extern void nfs4_free_stateowner(struct kref *kref); extern int 
set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_do_callback_rpc(struct work_struct *); extern void nfsd4_cb_recall(struct nfs4_delegation *dp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 3a359023c9f..641117f2188 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * File operations used by nfsd. Some of these have been ripped from * other parts of the kernel because they weren't exported, others @@ -35,8 +34,8 @@ #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 -#include <linux/nfs4_acl.h> -#include <linux/nfsd_idmap.h> +#include "acl.h" +#include "idmap.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" @@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + err = follow_down(&path, false); + if (err < 0) + goto out; exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { @@ -273,6 +273,13 @@ out: return err; } +static int nfsd_break_lease(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + return break_lease(inode, O_WRONLY | O_NONBLOCK); +} + /* * Commit metadata changes to stable storage. */ @@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, goto out; } - /* - * If we are changing the size of the file, then - * we need to break all leases. - */ - host_err = break_lease(inode, O_WRONLY | O_NONBLOCK); - if (host_err == -EWOULDBLOCK) - host_err = -ETIMEDOUT; - if (host_err) /* ENOMEM or EWOULDBLOCK */ - goto out_nfserr; - host_err = get_write_access(inode); if (host_err) goto out_nfserr; @@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, err = nfserr_notsync; if (!check_guard || guardtime == inode->i_ctime.tv_sec) { + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_nfserr; fh_lock(fhp); + host_err = notify_change(dentry, iap); err = nfserrno(host_err); fh_unlock(fhp); @@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, */ if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? 
O_WRONLY : 0)); - if (host_err == -EWOULDBLOCK) - host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; @@ -845,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct page **pp = rqstp->rq_respages + rqstp->rq_resused; struct page *page = buf->page; size_t size; - int ret; - - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; size = sd->len; @@ -879,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static inline int svc_msnfs(struct svc_fh *ffhp) -{ -#ifdef MSNFS - return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS); -#else - return 0; -#endif -} - static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) @@ -900,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; inode = file->f_path.dentry->d_inode; - if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) - goto out; - if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { .len = 0, @@ -927,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, fsnotify_access(file); } else err = nfserrno(host_err); -out: return err; } @@ -992,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; -#ifdef MSNFS - err = nfserr_perm; - - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) - goto out; -#endif - dentry = file->f_path.dentry; inode = dentry->d_inode; exp = fhp->fh_export; @@ -1050,7 +1023,6 @@ out_nfserr: err = 0; else err = nfserrno(host_err); -out: return err; } @@ -1670,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserrno(host_err); goto out_dput; } + err = nfserr_noent; + if (!dold->d_inode) + goto out_drop_write; + host_err = nfsd_break_lease(dold->d_inode); + if (host_err) + goto out_drop_write; host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); @@ -1681,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } +out_drop_write: mnt_drop_write(tfhp->fh_export->ex_path.mnt); out_dput: dput(dnew); @@ -1755,12 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ndentry == trap) goto out_dput_new; - if (svc_msnfs(ffhp) && - ((odentry->d_count > 1) || (ndentry->d_count > 1))) { - host_err = -EPERM; - goto out_dput_new; - } - host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; @@ -1768,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (host_err) goto out_dput_new; + host_err = nfsd_break_lease(odentry->d_inode); + if (host_err) + goto out_drop_write; host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) host_err = commit_metadata(ffhp); } - +out_drop_write: mnt_drop_write(ffhp->fh_export->ex_path.mnt); - out_dput_new: dput(ndentry); out_dput_old: @@ -1839,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (host_err) goto out_nfserr; - if (type != S_IFDIR) { /* It's UNLINK */ -#ifdef MSNFS - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (rdentry->d_count > 1)) { - 
host_err = -EPERM; - } else -#endif + host_err = nfsd_break_lease(rdentry->d_inode); + if (host_err) + goto out_put; + if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); - } else { /* It's RMDIR */ + else host_err = vfs_rmdir(dirp, rdentry); - } - +out_put: dput(rdentry); if (!host_err) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 60fce3dc5cb..366401e1a53 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -311,6 +311,11 @@ struct nfsd4_secinfo { struct svc_export *si_exp; /* response */ }; +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + struct nfsd4_setattr { stateid_t sa_stateid; /* request */ u32 sa_bmval[3]; /* request */ @@ -373,8 +378,8 @@ struct nfsd4_sequence { u32 cachethis; /* request */ #if 0 u32 target_maxslots; /* response */ - u32 status_flags; /* response */ #endif /* not yet */ + u32 status_flags; /* response */ }; struct nfsd4_destroy_session { @@ -422,6 +427,7 @@ struct nfsd4_op { /* NFSv4.1 */ struct nfsd4_exchange_id exchange_id; + struct nfsd4_bind_conn_to_session bind_conn_to_session; struct nfsd4_create_session create_session; struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; @@ -518,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); extern __be32 nfsd4_create_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_create_session *); diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 8b782b062ba..3ee67c67cc5 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -35,7 +35,20 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) { - return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); + return NILFS_I_NILFS(bmap->b_inode)->ns_dat; +} + +static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, + const char *fname, int err) +{ + struct inode *inode = bmap->b_inode; + + if (err == -EINVAL) { + nilfs_error(inode->i_sb, fname, + "broken bmap (inode number=%lu)\n", inode->i_ino); + err = -EIO; + } + return err; } /** @@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); - if (ret < 0) + if (ret < 0) { + ret = nilfs_bmap_convert_error(bmap, __func__, ret); goto out; + } if (NILFS_BMAP_USE_VBN(bmap)) { ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, &blocknr); @@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); up_read(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) @@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap, down_write(&bmap->b_sem); ret = nilfs_bmap_do_insert(bmap, key, rec); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) @@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) down_read(&bmap->b_sem); ret = bmap->b_ops->bop_last_key(bmap, &lastkey); - if (!ret) - *key = 
lastkey; up_read(&bmap->b_sem); + + if (ret < 0) + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + else + *key = lastkey; return ret; } @@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) down_write(&bmap->b_sem); ret = nilfs_bmap_do_delete(bmap, key); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) @@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) down_write(&bmap->b_sem); ret = nilfs_bmap_do_truncate(bmap, key); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** @@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) down_write(&bmap->b_sem); ret = bmap->b_ops->bop_propagate(bmap, bh); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** @@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap, down_write(&bmap->b_sem); ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** @@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) down_write(&bmap->b_sem); ret = bmap->b_ops->bop_mark(bmap, key, level); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5115814cb74..388e9e8f528 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { - struct inode *dat = - nilfs_dat_inode(NILFS_I_NILFS(inode)); + struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; /* blocknr is a virtual block number */ err = nilfs_dat_translate(dat, blocknr, &pblocknr); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index cb003c8ee1f..9d45773b79e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page, unsigned from, unsigned to) { struct inode *dir = mapping->host; - struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); loff_t pos = page_offset(page) + from; unsigned len = to - from; unsigned nr_dirty, copied; @@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page, i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) nilfs_set_transaction_flag(NILFS_TI_SYNC); - err = nilfs_set_file_dirty(sbi, dir, nr_dirty); + err = nilfs_set_file_dirty(dir, nr_dirty); WARN_ON(err); /* do not happen */ unlock_page(page); } diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index c9a30d7ff6f..2f560c9fb80 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = { .truncate = nilfs_truncate, .setattr = nilfs_setattr, .permission = nilfs_permission, + .fiemap = nilfs_fiemap, }; /* end of file */ diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 9f8a2da67f9..bfc73d3a30e 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, } err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); - if (unlikely(err)) { - if (err == -EINVAL) - nilfs_error(sb, __func__, "ifile is broken"); - else - nilfs_warning(sb, __func__, - "unable to read inode: %lu", - (unsigned 
long) ino); - } + if (unlikely(err)) + nilfs_warning(sb, __func__, "unable to read inode: %lu", + (unsigned long) ino); return err; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 77b48c8fab1..2fd440d8d6b 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct nilfs_inode_info *ii = NILFS_I(inode); __u64 blknum = 0; int err = 0, ret; - struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); + struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; down_read(&NILFS_MDT(dat)->mi_sem); @@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, inode->i_ino, (unsigned long long)blkoff); err = 0; - } else if (err == -EINVAL) { - nilfs_error(inode->i_sb, __func__, - "broken bmap (inode=%lu)\n", - inode->i_ino); - err = -EIO; } nilfs_transaction_abort(inode->i_sb); goto out; @@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); + set_buffer_delay(bh_result); map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed to proper value */ } else if (ret == -ENOENT) { @@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page) if (ret) { struct inode *inode = page->mapping->host; - struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); - nilfs_set_file_dirty(sbi, inode, nr_dirty); + nilfs_set_file_dirty(inode, nr_dirty); } return ret; } @@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, start + copied); copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); + nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); return err ? 
: copied; } @@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb, struct nilfs_root *root, unsigned long ino, struct inode *inode) { - struct nilfs_sb_info *sbi = NILFS_SB(sb); - struct inode *dat = nilfs_dat_inode(sbi->s_nilfs); + struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; struct buffer_head *bh; struct nilfs_inode *raw_inode; int err; - down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); if (unlikely(err)) goto bad_inode; @@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb, } nilfs_ifile_unmap_inode(root->ifile, ino, bh); brelse(bh); - up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); return 0; @@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb, brelse(bh); bad_inode: - up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); return err; } @@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, if (!test_bit(NILFS_I_BMAP, &ii->i_state)) return; - repeat: +repeat: ret = nilfs_bmap_last_key(ii->i_bmap, &b); if (ret == -ENOENT) return; @@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, nilfs_bmap_truncate(ii->i_bmap, b) == 0)) goto repeat; - failed: - if (ret == -EINVAL) - nilfs_error(ii->vfs_inode.i_sb, __func__, - "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); - else - nilfs_warning(ii->vfs_inode.i_sb, __func__, - "failed to truncate bmap (ino=%lu, err=%d)", - ii->vfs_inode.i_ino, ret); +failed: + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); } void nilfs_truncate(struct inode *inode) @@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode) nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_mark_inode_dirty(inode); - nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); + nilfs_set_file_dirty(inode, 0); nilfs_transaction_commit(sb); /* May construct a logical segment and may fail in sync mode. But truncate has no return value. 
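The nilfs2 bmap.c hunks above route every b_ops return value through the new nilfs_bmap_convert_error(), which logs "broken bmap" once and turns -EINVAL into -EIO; that is why the inode.c and ifile.c callers above could drop their own -EINVAL branches. A compilable sketch of the error-normalizing wrapper, with fprintf() standing in for nilfs_error():

#include <errno.h>
#include <stdio.h>

/*
 * Normalize a low-level error at the layer boundary: translate the
 * internal "metadata is corrupt" code (-EINVAL) into the external
 * I/O error (-EIO) and log it exactly once, so callers only have to
 * handle generic failures.
 */
static int convert_error(const char *fname, unsigned long ino, int err)
{
	if (err == -EINVAL) {
		fprintf(stderr, "error: %s: broken bmap (inode number=%lu)\n",
			fname, ino);
		err = -EIO;
	}
	return err;
}

static int bop_lookup_stub(unsigned long key)
{
	return key == 42 ? -EINVAL : 0;	/* simulate a corrupt entry */
}

int main(void)
{
	int ret = bop_lookup_stub(42);

	ret = convert_error(__func__, 1, ret);
	printf("caller sees %d (-EIO is %d)\n", ret, -EIO);
	return 0;
}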
*/ @@ -800,9 +790,9 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags) return generic_permission(inode, mask, flags, NULL); } -int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, - struct buffer_head **pbh) +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) { + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); struct nilfs_inode_info *ii = NILFS_I(inode); int err; @@ -843,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode) return ret; } -int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, - unsigned nr_dirty) +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) { + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); struct nilfs_inode_info *ii = NILFS_I(inode); atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); @@ -878,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, int nilfs_mark_inode_dirty(struct inode *inode) { - struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); struct buffer_head *ibh; int err; - err = nilfs_load_inode_block(sbi, inode, &ibh); + err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warning(inode->i_sb, __func__, "failed to reget inode block.\n"); @@ -924,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode) nilfs_mark_inode_dirty(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ } + +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + struct the_nilfs *nilfs = NILFS_I_NILFS(inode); + __u64 logical = 0, phys = 0, size = 0; + __u32 flags = 0; + loff_t isize; + sector_t blkoff, end_blkoff; + sector_t delalloc_blkoff; + unsigned long delalloc_blklen; + unsigned int blkbits = inode->i_blkbits; + int ret, n; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + + blkoff = start >> blkbits; + end_blkoff = (start + len - 1) >> blkbits; + + delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, + &delalloc_blkoff); + + do { + __u64 blkphy; + unsigned int maxblocks; + + if (delalloc_blklen && blkoff == delalloc_blkoff) { + if (size) { + /* End of the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + } + if (blkoff > end_blkoff) + break; + + flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; + logical = blkoff << blkbits; + phys = 0; + size = delalloc_blklen << blkbits; + + blkoff = delalloc_blkoff + delalloc_blklen; + delalloc_blklen = nilfs_find_uncommitted_extent( + inode, blkoff, &delalloc_blkoff); + continue; + } + + /* + * Limit the number of blocks that we look up so as + * not to get into the next delayed allocation extent. 
+ */ + maxblocks = INT_MAX; + if (delalloc_blklen) + maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, + maxblocks); + blkphy = 0; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + n = nilfs_bmap_lookup_contig( + NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + if (n < 0) { + int past_eof; + + if (unlikely(n != -ENOENT)) + break; /* error */ + + /* HOLE */ + blkoff++; + past_eof = ((blkoff << blkbits) >= isize); + + if (size) { + /* End of the current extent */ + + if (past_eof) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + size = 0; + } + if (blkoff > end_blkoff || past_eof) + break; + } else { + if (size) { + if (phys && blkphy << blkbits == phys + size) { + /* The current extent goes on */ + size += n << blkbits; + } else { + /* Terminate the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, + flags); + if (ret || blkoff > end_blkoff) + break; + + /* Start another extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + } else { + /* Start a new extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + blkoff += n; + } + cond_resched(); + } while (true); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index b185e937a33..496738963fd 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, int ret; down_read(&nilfs->ns_segctor_sem); - ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); + ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs); up_read(&nilfs->ns_segctor_sem); return ret; } @@ -242,8 +242,7 @@ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) { - struct inode *dat = nilfs_dat_inode(nilfs); - struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; struct nilfs_bdesc *bdescs = buf; int ret, i; @@ -421,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, size_t nmembs = argv->v_nmembs; int ret; - ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); + ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs); return (ret < 0) ? 
ret : nmembs; } @@ -430,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { size_t nmembs = argv->v_nmembs; - struct inode *dat = nilfs_dat_inode(nilfs); - struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; struct nilfs_bdesc *bdescs = buf; int ret, i; @@ -450,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, /* skip dead block */ continue; if (bdescs[i].bd_level == 0) { - ret = nilfs_mdt_mark_block_dirty(dat, + ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat, bdescs[i].bd_offset); if (ret < 0) { WARN_ON(ret == -ENOENT); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 39a5b84e2c9..6a0e2a189f6 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, * * %-ENOENT - the specified block does not exist (hole block) * - * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) - * * %-EROFS - Read only filesystem (for create mode) */ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, @@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error - * - * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) */ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) { @@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) * %-EIO - I/O error * * %-ENOENT - the specified block does not exist (hole block) - * - * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) */ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) { @@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct buffer_head *bh_frozen; struct page *page; int blkbits = inode->i_blkbits; - int ret = -ENOMEM; page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); if (!page) - return ret; + return -ENOMEM; if (!page_has_buffers(page)) create_empty_buffers(page, 1 << blkbits, 0); bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); - if (bh_frozen) { - if (!buffer_uptodate(bh_frozen)) - nilfs_copy_buffer(bh_frozen, bh); - if (list_empty(&bh_frozen->b_assoc_buffers)) { - list_add_tail(&bh_frozen->b_assoc_buffers, - &shadow->frozen_buffers); - set_buffer_nilfs_redirected(bh); - } else { - brelse(bh_frozen); /* already frozen */ - } - ret = 0; + + if (!buffer_uptodate(bh_frozen)) + nilfs_copy_buffer(bh_frozen, bh); + if (list_empty(&bh_frozen->b_assoc_buffers)) { + list_add_tail(&bh_frozen->b_assoc_buffers, + &shadow->frozen_buffers); + set_buffer_nilfs_redirected(bh); + } else { + brelse(bh_frozen); /* already frozen */ } + unlock_page(page); page_cache_release(page); - return ret; + return 0; } struct buffer_head * diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 6e9557ecf16..98034271cd0 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = { .rename = nilfs_rename, .setattr = nilfs_setattr, .permission = nilfs_permission, + .fiemap = nilfs_fiemap, }; const struct inode_operations nilfs_special_inode_operations = { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 0ca98823db5..777e8fd0430 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void) 
return nilfs_test_transaction_flag(NILFS_TI_WRITER); } -static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs) -{ - return nilfs->ns_dat; -} - /* * function prototype */ @@ -257,13 +252,13 @@ extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); int nilfs_permission(struct inode *inode, int mask, unsigned int flags); -extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, - struct buffer_head **); +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); -extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, - unsigned); +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); extern int nilfs_mark_inode_dirty(struct inode *); extern void nilfs_dirty_inode(struct inode *); +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index a6c3c2e817f..0c432416cfe 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, } return nc; } - + void nilfs_mapping_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); @@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page) } return TestClearPageDirty(page); } + +/** + * nilfs_find_uncommitted_extent - find extent of uncommitted data + * @inode: inode + * @start_blk: start block offset (in) + * @blkoff: start offset of the found extent (out) + * + * This function searches an extent of buffers marked "delayed" which + * starts from a block offset equal to or larger than @start_blk. If + * such an extent was found, this will store the start offset in + * @blkoff and return its length in blocks. Otherwise, zero is + * returned. 
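The kernel-doc above describes nilfs_find_uncommitted_extent(), which pairs with the set_buffer_delay() call added to nilfs_get_block() earlier: nilfs_fiemap() uses it to report delayed-allocation ranges. A simplified user-space model follows; it replaces the pagevec and buffer-head walk with a flat per-block flag array, which is an illustrative assumption:

#include <stdio.h>

/*
 * Simplified model of nilfs_find_uncommitted_extent(): given per-block
 * "delayed" flags, find the first run of delayed blocks starting at or
 * after start_blk; store its start in *blkoff and return its length in
 * blocks, or zero if no such run exists.
 */
static unsigned long find_uncommitted_extent(const unsigned char *delayed,
					     unsigned long nblocks,
					     unsigned long start_blk,
					     unsigned long *blkoff)
{
	unsigned long b, length = 0;

	for (b = start_blk; b < nblocks; b++) {
		if (delayed[b]) {
			if (length == 0)
				*blkoff = b;
			length++;
		} else if (length > 0) {
			break;	/* end of the first delayed run */
		}
	}
	return length;
}

int main(void)
{
	unsigned char map[] = { 0, 0, 1, 1, 1, 0, 1 };
	unsigned long start = 0;
	unsigned long len = find_uncommitted_extent(map, 7, 1, &start);

	printf("extent at %lu, %lu blocks\n", start, len);	/* 2, 3 */
	return 0;
}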
+ */ +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff) +{ + unsigned int i; + pgoff_t index; + unsigned int nblocks_in_page; + unsigned long length = 0; + sector_t b; + struct pagevec pvec; + struct page *page; + + if (inode->i_mapping->nrpages == 0) + return 0; + + index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + +repeat: + pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, + pvec.pages); + if (pvec.nr == 0) + return length; + + if (length > 0 && pvec.pages[0]->index > index) + goto out; + + b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + i = 0; + do { + page = pvec.pages[i]; + + lock_page(page); + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (b < start_blk) + continue; + if (buffer_delay(bh)) { + if (length == 0) + *blkoff = b; + length++; + } else if (length > 0) { + goto out_locked; + } + } while (++b, bh = bh->b_this_page, bh != head); + } else { + if (length > 0) + goto out_locked; + + b += nblocks_in_page; + } + unlock_page(page); + + } while (++i < pagevec_count(&pvec)); + + index = page->index + 1; + pagevec_release(&pvec); + cond_resched(); + goto repeat; + +out_locked: + unlock_page(page); +out: + pagevec_release(&pvec); + return length; +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index fb9e8a8a203..622df27cd89 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff); #define NILFS_PAGE_BUG(page, m, a...) 
\ do { nilfs_page_bug(page); BUG(); } while (0) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 5d2711c28da..3dfcd3b7d38 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, if (unlikely(err)) goto failed_page; - err = nilfs_set_file_dirty(sbi, inode, 1); + err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_page; diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h index 35a07157b98..7a17715f215 100644 --- a/fs/nilfs2/sb.h +++ b/fs/nilfs2/sb.h @@ -27,14 +27,6 @@ #include <linux/types.h> #include <linux/fs.h> -/* - * Mount options - */ -struct nilfs_mount_options { - unsigned long mount_opt; - __u64 snapshot_cno; -}; - struct the_nilfs; struct nilfs_sc_info; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 687d090cea3..55ebae5c7f3 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, return err; } -static int nilfs_handle_bmap_error(int err, const char *fname, - struct inode *inode, struct super_block *sb) -{ - if (err == -EINVAL) { - nilfs_error(sb, fname, "broken bmap (inode=%lu)\n", - inode->i_ino); - err = -EIO; - } - return err; -} - /* * Callback functions that enumerate, mark, and collect dirty blocks */ @@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci, int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); - if (unlikely(err < 0)) - return nilfs_handle_bmap_error(err, __func__, inode, - sci->sc_super); + if (err < 0) + return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(struct nilfs_binfo_v)); @@ -539,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { - int err; - - err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); - if (unlikely(err < 0)) - return nilfs_handle_bmap_error(err, __func__, inode, - sci->sc_super); - return 0; + return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); } static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, @@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); - if (unlikely(err < 0)) - return nilfs_handle_bmap_error(err, __func__, inode, - sci->sc_super); + if (err < 0) + return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); if (!err) @@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, ret++; if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) ret++; - if (ret || nilfs_doing_gc()) - if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) - ret++; + if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat)) + ret++; return ret; } @@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) nilfs_mdt_clear_dirty(sci->sc_root->ifile); nilfs_mdt_clear_dirty(nilfs->ns_cpfile); nilfs_mdt_clear_dirty(nilfs->ns_sufile); - nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); + nilfs_mdt_clear_dirty(nilfs->ns_dat); } static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) @@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; - nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + + nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + NILFS_SR_DAT_OFFSET(isz), 1); 
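nilfs_segctor_fill_in_super_root() here serializes the DAT, cpfile and sufile inodes into one super root block at fixed offsets, now addressing the DAT directly as nilfs->ns_dat. A small sketch of that offset-based packing; the record layout and the SR_*_OFFSET values are invented for illustration:

#include <stdio.h>
#include <string.h>

/* Hypothetical on-disk record and layout offsets. */
struct raw_inode { unsigned long blocks; unsigned long size; };

#define SR_DAT_OFFSET(isz)    ((size_t)16)
#define SR_CPFILE_OFFSET(isz) (16 + (isz))
#define SR_SUFILE_OFFSET(isz) (16 + 2 * (isz))

/* Serialize one in-memory record into the block at the given offset. */
static void write_inode_common(const struct raw_inode *src,
			       unsigned char *block, size_t off)
{
	memcpy(block + off, src, sizeof(*src));
}

int main(void)
{
	unsigned char block[4096] = { 0 };
	size_t isz = sizeof(struct raw_inode);
	struct raw_inode dat = { 100, 4096 }, cp = { 7, 512 }, su = { 3, 64 };

	write_inode_common(&dat, block, SR_DAT_OFFSET(isz));
	write_inode_common(&cp, block, SR_CPFILE_OFFSET(isz));
	write_inode_common(&su, block, SR_SUFILE_OFFSET(isz));

	printf("dat at %zu, cpfile at %zu, sufile at %zu\n",
	       (size_t)SR_DAT_OFFSET(isz), (size_t)SR_CPFILE_OFFSET(isz),
	       (size_t)SR_SUFILE_OFFSET(isz));
	return 0;
}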
nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + NILFS_SR_CPFILE_OFFSET(isz), 1); @@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_stage.scnt++; /* Fall through */ case NILFS_ST_DAT: dat_stage: - err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), + err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, &nilfs_sc_dat_ops); if (unlikely(err)) break; @@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, return 0; failed_bmap: - err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super); return err; } @@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err) if (!err) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); + clear_buffer_delay(bh); clear_buffer_nilfs_volatile(bh); } brelse(bh); /* for b_assoc_buffers */ @@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); + clear_buffer_delay(bh); clear_buffer_nilfs_volatile(bh); clear_buffer_nilfs_redirected(bh); if (bh == segbuf->sb_super_root) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index e2dcc9c733f..0994f6a76c0 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -47,7 +47,6 @@ #include <linux/crc32.h> #include <linux/vfs.h> #include <linux/writeback.h> -#include <linux/kobject.h> #include <linux/seq_file.h> #include <linux/mount.h> #include "nilfs.h" @@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (!(sb->s_flags & MS_RDONLY)) { @@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function, void nilfs_warning(struct super_block *sb, const char *function, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "NILFS warning (device %s): %s: ", - sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } @@ -1010,11 +1018,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; unsigned long old_sb_flags; - struct nilfs_mount_options old_opts; + unsigned long old_mount_opt; int err; old_sb_flags = sb->s_flags; - old_opts.mount_opt = sbi->s_mount_opt; + old_mount_opt = sbi->s_mount_opt; if (!parse_options(data, sb, 1)) { err = -EINVAL; @@ -1083,7 +1091,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) restore_opts: sb->s_flags = old_sb_flags; - sbi->s_mount_opt = old_opts.mount_opt; + sbi->s_mount_opt = old_mount_opt; return err; } @@ -1155,14 +1163,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; struct dentry *root_dentry; int err, s_new = false; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return ERR_CAST(sd.bdev); @@ -1241,7 +1249,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode); return root_dentry; @@ -1250,7 +1258,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode); return ERR_PTR(err); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 0254be2d73c..ad4ac607cf5 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) printk(KERN_INFO "NILFS: recovery complete.\n"); skip_recovery: - set_nilfs_loaded(nilfs); nilfs_clear_recovery_info(&ri); sbi->s_super->s_flags = s_flags; return 0; @@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { - struct inode *dat = nilfs_dat_inode(nilfs); unsigned long ncleansegs; - down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); - up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; return 0; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 69226e14b74..fd85e4c05c6 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -36,8 +36,6 @@ /* the_nilfs struct */ enum { THE_NILFS_INIT = 0, /* Information from super_block is set */ - THE_NILFS_LOADED, /* Roll-back/roll-forward has done and - the latest checkpoint was loaded */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ THE_NILFS_GC_RUNNING, /* gc process is running */ THE_NILFS_SB_DIRTY, /* super block is dirty */ @@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ } THE_NILFS_FNS(INIT, init) -THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) THE_NILFS_FNS(GC_RUNNING, gc_running) 
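The nilfs_error() and nilfs_warning() hunks above collapse three printk() calls into one that formats the variadic arguments through struct va_format and %pV, so concurrent messages can no longer interleave. %pV is kernel-only; a user-space approximation builds the whole line before a single write:

#include <stdarg.h>
#include <stdio.h>

/*
 * Emit "PREFIX (device X): func: msg" with one write so concurrent
 * callers cannot interleave fragments, the same goal the kernel code
 * reaches with a single printk("... %pV\n", ..., &vaf).
 */
static void fs_warning(const char *dev, const char *function,
		       const char *fmt, ...)
{
	char msg[256];
	va_list args;

	va_start(args, fmt);
	vsnprintf(msg, sizeof(msg), fmt, args);
	va_end(args);

	fprintf(stderr, "NILFS warning (device %s): %s: %s\n",
		dev, function, msg);
}

int main(void)
{
	fs_warning("sda1", __func__, "unable to read inode: %lu", 42UL);
	return 0;
}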
THE_NILFS_FNS(SB_DIRTY, sb_dirty) diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 3ac36b7bf6b..7dceff005a6 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -6,7 +6,7 @@ config FANOTIFY ---help--- Say Y here to enable fanotify suport. fanotify is a file access notification system which differs from inotify in that it sends - and open file descriptor to the userspace listener along with + an open file descriptor to the userspace listener along with the event. If unsure, say Y. diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index 58b6be99254..4ff028fcfd6 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 113ebd9f25a..f4b1057abdd 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp, * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s * single-segment behaviour. * - * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both - * when atomic and when not atomic. This is ok because - * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() - * and it is ok to call this when non-atomic. - * Infact, the only difference between __copy_from_user_inatomic() and + * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when + * atomic and when not atomic. This is ok because it calls + * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In + * fact, the only difference between __copy_from_user_inatomic() and * __copy_from_user() is that the latter calls might_sleep() and the former - * should not zero the tail of the buffer on error. And on many - * architectures __copy_from_user_inatomic() is just defined to - * __copy_from_user() so it makes no difference at all on those architectures. + * should not zero the tail of the buffer on error. And on many architectures + * __copy_from_user_inatomic() is just defined to __copy_from_user() so it + * makes no difference at all on those architectures. */ static inline size_t ntfs_copy_from_user_iovec(struct page **pages, unsigned nr_pages, unsigned ofs, const struct iovec **iov, @@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages, if (unlikely(copied != len)) { /* Do it the slow way. */ addr = kmap(*pages); - copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, - *iov, *iov_ofs, len); - /* - * Zero the rest of the target like __copy_from_user(). 
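The ntfs_copy_from_user_iovec() rework here (the error path continues just below) retries a failed atomic copy via kmap() and zeroes the destination tail only on the err_out exit, preserving the __copy_from_user() contract that callers never see stale bytes. A compilable sketch of that fast-path/slow-path shape, with stub copy routines in place of __copy_from_user_inatomic():

#include <stdio.h>
#include <string.h>

/* Stub "atomic" copy that may copy fewer bytes than asked. */
static size_t copy_inatomic(void *dst, const void *src, size_t len)
{
	size_t done = len > 8 ? 8 : len;	/* simulate a partial copy */

	memcpy(dst, src, done);
	return done;
}

/* Stub slow path; here it always succeeds. */
static size_t copy_slow(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
	return len;
}

/*
 * Try the fast copy; if it is partial, redo the segment on the slow
 * path, and only if that also comes up short zero the uncopied tail so
 * the caller never sees stale bytes.
 */
static size_t copy_with_fallback(void *dst, const void *src, size_t len)
{
	size_t copied = copy_inatomic(dst, src, len);

	if (copied != len) {
		copied = copy_slow(dst, src, len);
		if (copied != len)
			memset((char *)dst + copied, 0, len - copied);
	}
	return copied;
}

int main(void)
{
	char dst[16];
	size_t n = copy_with_fallback(dst, "0123456789abcdef", sizeof(dst));

	printf("copied %zu bytes\n", n);
	return 0;
}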
- */ - memset(addr + ofs + copied, 0, len - copied); - kunmap(*pages); + copied = __ntfs_copy_from_user_iovec_inatomic(addr + + ofs, *iov, *iov_ofs, len); if (unlikely(copied != len)) goto err_out; + kunmap(*pages); } total += len; + ntfs_set_next_iovec(iov, iov_ofs, len); bytes -= len; if (!bytes) break; - ntfs_set_next_iovec(iov, iov_ofs, len); ofs = 0; } while (++pages < last_page); out: return total; err_out: - total += copied; + BUG_ON(copied > len); /* Zero the rest of the target like __copy_from_user(). */ + memset(addr + ofs + copied, 0, len - copied); + kunmap(*pages); + total += copied; + ntfs_set_next_iovec(iov, iov_ofs, copied); while (++pages < last_page) { bytes -= len; if (!bytes) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index a30ecacc01f..29099a07b9f 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void) ntfs_sysctl(0); } -MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); +MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 0d840669698..77a8de5f711 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -1,7 +1,6 @@ config OCFS2_FS tristate "OCFS2 file system support" - depends on NET && SYSFS - select CONFIGFS_FS + depends on NET && SYSFS && CONFIGFS_FS select JBD2 select CRC32 select QUOTA @@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER config OCFS2_FS_STATS bool "OCFS2 statistics" - depends on OCFS2_FS + depends on OCFS2_FS && DEBUG_FS default y help This option allows some fs statistics to be captured. Enabling diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 592fae5007d..e4984e259cb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) return ret; } -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); static void ocfs2_adjust_rightmost_records(handle_t *handle, @@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); + osb->truncated_clusters += num_clusters; bail: mlog_exit(status); return status; @@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, i--; } + osb->truncated_clusters = 0; + bail: mlog_exit(status); return status; @@ -7139,64 +7141,6 @@ bail: } /* - * Expects the inode to already be locked. 
- */ -int ocfs2_prepare_truncate(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context **tc) -{ - int status; - unsigned int new_i_clusters; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct buffer_head *last_eb_bh = NULL; - - mlog_entry_void(); - - *tc = NULL; - - new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, - i_size_read(inode)); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - - mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" - "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, - (unsigned long long)le64_to_cpu(fe->i_size)); - - *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); - if (!(*tc)) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); - - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_extent_block(INODE_CACHE(inode), - le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - } - - (*tc)->tc_last_eb_bh = last_eb_bh; - - status = 0; -bail: - if (status < 0) { - if (*tc) - ocfs2_free_truncate_context(*tc); - *tc = NULL; - } - mlog_exit_void(); - return status; -} - -/* * 'start' is inclusive, 'end' is not. */ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, @@ -7270,18 +7214,3 @@ out_commit: out: return ret; } - -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) -{ - /* - * The caller is responsible for completing deallocation - * before freeing the context. - */ - if (tc->tc_dealloc.c_first_suballocator != NULL) - mlog(ML_NOTICE, - "Truncate completion has non-empty dealloc context\n"); - - brelse(tc->tc_last_eb_bh); - - kfree(tc); -} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 55762b554b9..3bd08a03251 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -228,10 +228,6 @@ struct ocfs2_truncate_context { int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end); -int ocfs2_prepare_truncate(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context **tc); int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0d7c5540ad6..1fbb0e20131 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1630,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + mutex_lock(&osb->osb_tl_inode->i_mutex); + truncated_clusters = osb->truncated_clusters; + mutex_unlock(&osb->osb_tl_inode->i_mutex); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. 
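ocfs2_truncate_log_append() above now accumulates osb->truncated_clusters (and replay resets it to zero), so ocfs2_try_to_free_truncate_log(), begun here, can tell whether flushing the log could satisfy an allocation; the write path just below retries once on -ENOSPC. A sketch of that flush-once-and-retry logic, with plain counters standing in for the journal and cluster bitmaps:

#include <errno.h>
#include <stdio.h>

static unsigned int truncated_clusters = 8;	/* reclaimable space */
static unsigned int free_clusters;

static int alloc_clusters(unsigned int needed)
{
	if (needed > free_clusters)
		return -ENOSPC;
	free_clusters -= needed;
	return 0;
}

/* Returns 1 if enough space was reclaimed to retry, 0 if not. */
static int try_to_free_truncate_log(unsigned int needed)
{
	if (truncated_clusters < needed)
		return 0;
	free_clusters += truncated_clusters;	/* "flush" the log */
	truncated_clusters = 0;
	return 1;
}

static int write_begin(unsigned int needed)
{
	int try_free = 1;	/* retry at most once */
	int ret;

try_again:
	ret = alloc_clusters(needed);
	if (ret == -ENOSPC && try_free) {
		try_free = 0;
		if (try_to_free_truncate_log(needed))
			goto try_again;
	}
	return ret;
}

int main(void)
{
	printf("write_begin: %d\n", write_begin(4));	/* succeeds on retry */
	return 0;
}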
+ */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + int ocfs2_write_begin_nolock(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1637,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp, struct buffer_head *di_bh, struct page *mmap_page) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; - unsigned int clusters_to_alloc, extents_to_split; + unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1646,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; struct ocfs2_extent_tree et; + int try_free = 1, ret1; +try_again: ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { mlog_errno(ret); @@ -1681,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } else if (ret == 1) { + clusters_need = wc->w_clen; ret = ocfs2_refcount_cow(inode, filp, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { @@ -1695,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } + clusters_need += clusters_to_alloc; di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; @@ -1817,6 +1858,22 @@ out: ocfs2_free_alloc_context(data_ac); if (meta_ac) ocfs2_free_alloc_context(meta_ac); + + if (ret == -ENOSPC && try_free) { + /* + * Try to free some truncate log so that we can have enough + * clusters to allocate. + */ + try_free = 0; + + ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); + if (ret1 == 1) + goto try_again; + + if (ret1 < 0) + mlog_errno(ret1); + } + return ret; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9e3d45bcb5f..b108e863d8f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_REGION_LIVENODES 4 #define O2HB_DB_TYPE_REGION_NUMBER 5 #define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 +#define O2HB_DB_TYPE_REGION_PINNED 7 struct o2hb_debug_buf { int db_type; int db_size; @@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" #define O2HB_DEBUG_REGION_NUMBER "num" #define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" +#define O2HB_DEBUG_REGION_PINNED "pinned" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; +/* + * o2hb_dependent_users tracks the number of registered callbacks that depend + * on heartbeat. o2net and o2dlm are two entities that register this callback. + * However only o2dlm depends on the heartbeat. It does not want the heartbeat + * to stop while a dlm domain is still active. + */ +unsigned int o2hb_dependent_users; + +/* + * In global heartbeat mode, all regions are pinned if there are one or more + * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. 
All + * regions are unpinned if the region count exceeds the cut off or the number + * of dependent users falls to zero. + */ +#define O2HB_PIN_CUT_OFF 3 + +/* + * In local heartbeat mode, we assume the dlm domain name to be the same as + * region uuid. This is true for domains created for the file system but not + * necessarily true for userdlm domains. This is a known limitation. + * + * In global heartbeat mode, we pin/unpin all o2hb regions. This solution + * works for both file system and userdlm domains. + */ +static int o2hb_region_pin(const char *region_uuid); +static void o2hb_region_unpin(const char *region_uuid); + /* Only sets a new threshold if there are no active regions. * * No locking or otherwise interesting code is required for reading @@ -186,7 +215,9 @@ struct o2hb_region { struct config_item hr_item; struct list_head hr_all_item; - unsigned hr_unclean_stop:1; + unsigned hr_unclean_stop:1, + hr_item_pinned:1, + hr_item_dropped:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -212,9 +243,11 @@ struct o2hb_region { struct dentry *hr_debug_livenodes; struct dentry *hr_debug_regnum; struct dentry *hr_debug_elapsed_time; + struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; + struct o2hb_debug_buf *hr_db_pinned; /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have @@ -701,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, config_item_name(®->hr_item)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + + /* + * If global heartbeat active, unpin all regions if the + * region count > CUT_OFF + */ + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) + o2hb_region_unpin(NULL); } static int o2hb_check_slot(struct o2hb_region *reg, @@ -1041,6 +1082,9 @@ static int o2hb_thread(void *data) set_user_nice(current, -20); + /* Pin node */ + o2nm_depend_this_node(); + while (!kthread_should_stop() && !reg->hr_unclean_stop) { /* We track the time spent inside * o2hb_do_disk_heartbeat so that we avoid more than @@ -1090,6 +1134,9 @@ static int o2hb_thread(void *data) mlog_errno(ret); } + /* Unpin node */ + o2nm_undepend_this_node(); + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); return 0; @@ -1142,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) reg->hr_last_timeout_start)); goto done; + case O2HB_DB_TYPE_REGION_PINNED: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + !!reg->hr_item_pinned); + goto done; + default: goto done; } @@ -1315,6 +1368,8 @@ int o2hb_init(void) memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + o2hb_dependent_users = 0; + return o2hb_debug_init(); } @@ -1384,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item) debugfs_remove(reg->hr_debug_livenodes); debugfs_remove(reg->hr_debug_regnum); debugfs_remove(reg->hr_debug_elapsed_time); + debugfs_remove(reg->hr_debug_pinned); debugfs_remove(reg->hr_debug_dir); spin_lock(&o2hb_live_lock); @@ -1673,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; reg->hr_bdev = I_BDEV(filp->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); if (ret) { 
reg->hr_bdev = NULL; goto out; @@ -1948,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) goto bail; } + reg->hr_debug_pinned = + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, + reg->hr_debug_dir, + &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, + 0, 0, reg); + if (!reg->hr_debug_pinned) { + mlog_errno(ret); + goto bail; + } + ret = 0; bail: return ret; @@ -2002,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, { struct task_struct *hb_task; struct o2hb_region *reg = to_o2hb_region(item); + int quorum_region = 0; /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); if (o2hb_global_heartbeat_active()) { clear_bit(reg->hr_region_num, o2hb_region_bitmap); clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); } hb_task = reg->hr_task; reg->hr_task = NULL; + reg->hr_item_dropped = 1; spin_unlock(&o2hb_live_lock); if (hb_task) @@ -2028,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (o2hb_global_heartbeat_active()) printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", config_item_name(®->hr_item)); + config_item_put(item); + + if (!o2hb_global_heartbeat_active() || !quorum_region) + return; + + /* + * If global heartbeat active and there are dependent users, + * pin all regions if quorum region count <= CUT_OFF + */ + spin_lock(&o2hb_live_lock); + + if (!o2hb_dependent_users) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); } struct o2hb_heartbeat_group_attribute { @@ -2214,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, } EXPORT_SYMBOL_GPL(o2hb_setup_callback); -static struct o2hb_region *o2hb_find_region(const char *region_uuid) +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only pin the matching region. In global we pin all the active + * regions. + */ +static int o2hb_region_pin(const char *region_uuid) { - struct o2hb_region *p, *reg = NULL; + int ret = 0, found = 0; + struct o2hb_region *reg; + char *uuid; assert_spin_locked(&o2hb_live_lock); - list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { - if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { - reg = p; - break; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + + /* local heartbeat */ + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned || reg->hr_item_dropped) + goto skip_pin; + + /* Ignore ENOENT only for local hb (userdlm domain) */ + ret = o2nm_depend_item(®->hr_item); + if (!ret) { + mlog(ML_CLUSTER, "Pin region %s\n", uuid); + reg->hr_item_pinned = 1; + } else { + if (ret == -ENOENT && found) + ret = 0; + else { + mlog(ML_ERROR, "Pin region %s fails with %d\n", + uuid, ret); + break; + } } +skip_pin: + if (found) + break; } - return reg; + return ret; } -static int o2hb_region_get(const char *region_uuid) +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only unpin the matching region. 
In global we unpin all the + * active regions. + */ +static void o2hb_region_unpin(const char *region_uuid) { - int ret = 0; struct o2hb_region *reg; + char *uuid; + int found = 0; - spin_lock(&o2hb_live_lock); + assert_spin_locked(&o2hb_live_lock); - reg = o2hb_find_region(region_uuid); - if (!reg) - ret = -ENOENT; - spin_unlock(&o2hb_live_lock); + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } - if (ret) - goto out; + if (reg->hr_item_pinned) { + mlog(ML_CLUSTER, "Unpin region %s\n", uuid); + o2nm_undepend_item(®->hr_item); + reg->hr_item_pinned = 0; + } + if (found) + break; + } +} - ret = o2nm_depend_this_node(); - if (ret) - goto out; +static int o2hb_region_inc_user(const char *region_uuid) +{ + int ret = 0; - ret = o2nm_depend_item(®->hr_item); - if (ret) - o2nm_undepend_this_node(); + spin_lock(&o2hb_live_lock); -out: + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + ret = o2hb_region_pin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and this is the first dependent user, + * pin all regions if quorum region count <= CUT_OFF + */ + o2hb_dependent_users++; + if (o2hb_dependent_users > 1) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + ret = o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); return ret; } -static void o2hb_region_put(const char *region_uuid) +void o2hb_region_dec_user(const char *region_uuid) { - struct o2hb_region *reg; - spin_lock(&o2hb_live_lock); - reg = o2hb_find_region(region_uuid); + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + o2hb_region_unpin(region_uuid); + goto unlock; + } - spin_unlock(&o2hb_live_lock); + /* + * if global heartbeat active and there are no dependent users, + * unpin all quorum regions + */ + o2hb_dependent_users--; + if (!o2hb_dependent_users) + o2hb_region_unpin(NULL); - if (reg) { - o2nm_undepend_item(®->hr_item); - o2nm_undepend_this_node(); - } +unlock: + spin_unlock(&o2hb_live_lock); } int o2hb_register_callback(const char *region_uuid, @@ -2291,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid, } if (region_uuid) { - ret = o2hb_region_get(region_uuid); - if (ret) + ret = o2hb_region_inc_user(region_uuid); + if (ret) { + mlog_errno(ret); goto out; + } } down_write(&o2hb_callback_sem); @@ -2311,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid, up_write(&o2hb_callback_sem); ret = 0; out: - mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", + mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n", ret, __builtin_return_address(0), hc); return ret; } @@ -2322,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid, { BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); - mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", + mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", __builtin_return_address(0), hc); /* XXX Can this happen _with_ a region reference? 
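o2hb_region_inc_user() and o2hb_region_dec_user() above replace the old per-region get/put: in global heartbeat mode the first dependent user pins every region (when the quorum region count is at or below O2HB_PIN_CUT_OFF) and the last user unpins them all. A sketch of that reference-counted pinning, with the configfs dependency calls stubbed out:

#include <stdio.h>

#define PIN_CUT_OFF 3

static unsigned int dependent_users;
static unsigned int quorum_regions = 2;
static int regions_pinned;

static void pin_all_regions(void)   { regions_pinned = 1; }
static void unpin_all_regions(void) { regions_pinned = 0; }

/* First dependent user pins all regions if the quorum count is small. */
static void region_inc_user(void)
{
	if (++dependent_users > 1)
		return;
	if (quorum_regions <= PIN_CUT_OFF)
		pin_all_regions();
}

/* Last dependent user unpins everything. */
static void region_dec_user(void)
{
	if (--dependent_users == 0)
		unpin_all_regions();
}

int main(void)
{
	region_inc_user();
	printf("after first user: pinned=%d\n", regions_pinned);
	region_inc_user();
	region_dec_user();
	region_dec_user();
	printf("after last user:  pinned=%d\n", regions_pinned);
	return 0;
}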
*/ @@ -2330,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid, return; if (region_uuid) - o2hb_region_put(region_uuid); + o2hb_region_dec_user(region_uuid); down_write(&o2hb_callback_sem); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index a3f150e52b0..3a5835904b3 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -46,10 +46,15 @@ #define O2NET_DEBUG_DIR "o2net" #define SC_DEBUG_NAME "sock_containers" #define NST_DEBUG_NAME "send_tracking" +#define STATS_DEBUG_NAME "stats" + +#define SHOW_SOCK_CONTAINERS 0 +#define SHOW_SOCK_STATS 1 static struct dentry *o2net_dentry; static struct dentry *sc_dentry; static struct dentry *nst_dentry; +static struct dentry *stats_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int nst_seq_show(struct seq_file *seq, void *v) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; + ktime_t now; + s64 sock, send, status; spin_lock(&o2net_debug_lock); nst = next_nst(dummy_nst); + if (!nst) + goto out; - if (nst != NULL) { - /* get_task_comm isn't exported. oh well. */ - seq_printf(seq, "%p:\n" - " pid: %lu\n" - " tgid: %lu\n" - " process name: %s\n" - " node: %u\n" - " sc: %p\n" - " message id: %d\n" - " message type: %u\n" - " message key: 0x%08x\n" - " sock acquiry: %lu.%ld\n" - " send start: %lu.%ld\n" - " wait start: %lu.%ld\n", - nst, (unsigned long)nst->st_task->pid, - (unsigned long)nst->st_task->tgid, - nst->st_task->comm, nst->st_node, - nst->st_sc, nst->st_id, nst->st_msg_type, - nst->st_msg_key, - nst->st_sock_time.tv_sec, - (long)nst->st_sock_time.tv_usec, - nst->st_send_time.tv_sec, - (long)nst->st_send_time.tv_usec, - nst->st_status_time.tv_sec, - (long)nst->st_status_time.tv_usec); - } + now = ktime_get(); + sock = ktime_to_us(ktime_sub(now, nst->st_sock_time)); + send = ktime_to_us(ktime_sub(now, nst->st_send_time)); + status = ktime_to_us(ktime_sub(now, nst->st_status_time)); + + /* get_task_comm isn't exported. oh well. 
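nst_seq_show() here now keeps ktime_t stamps and prints each as a relative "usecs ago" value computed against ktime_get(), instead of raw timeval fields. A user-space equivalent of the same computation on the monotonic clock:

#include <stdio.h>
#include <time.h>

/* Microseconds elapsed since an earlier CLOCK_MONOTONIC stamp. */
static long long usecs_ago(const struct timespec *then)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (now.tv_sec - then->tv_sec) * 1000000LL +
	       (now.tv_nsec - then->tv_nsec) / 1000;
}

int main(void)
{
	struct timespec start;

	clock_gettime(CLOCK_MONOTONIC, &start);
	/* ... some work would happen here ... */
	printf("sock acquiry: %lld usecs ago\n", usecs_ago(&start));
	return 0;
}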
*/ + seq_printf(seq, "%p:\n" + " pid: %lu\n" + " tgid: %lu\n" + " process name: %s\n" + " node: %u\n" + " sc: %p\n" + " message id: %d\n" + " message type: %u\n" + " message key: 0x%08x\n" + " sock acquiry: %lld usecs ago\n" + " send start: %lld usecs ago\n" + " wait start: %lld usecs ago\n", + nst, (unsigned long)task_pid_nr(nst->st_task), + (unsigned long)nst->st_task->tgid, + nst->st_task->comm, nst->st_node, + nst->st_sc, nst->st_id, nst->st_msg_type, + nst->st_msg_key, + (long long)sock, + (long long)send, + (long long)status); +out: spin_unlock(&o2net_debug_lock); return 0; @@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc) spin_unlock(&o2net_debug_lock); } +struct o2net_sock_debug { + int dbg_ctxt; + struct o2net_sock_container *dbg_sock; +}; + static struct o2net_sock_container *next_sc(struct o2net_sock_container *sc_start) { @@ -253,7 +268,8 @@ static struct o2net_sock_container static void *sc_seq_start(struct seq_file *seq, loff_t *pos) { - struct o2net_sock_container *sc, *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); @@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct o2net_sock_container *sc, *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); @@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec +#ifdef CONFIG_OCFS2_FS_STATS +# define sc_send_count(_s) ((_s)->sc_send_count) +# define sc_recv_count(_s) ((_s)->sc_recv_count) +# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total)) +# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total)) +# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total)) +# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total)) +#else +# define sc_send_count(_s) (0U) +# define sc_recv_count(_s) (0U) +# define sc_tv_acquiry_total_ns(_s) (0LL) +# define sc_tv_send_total_ns(_s) (0LL) +# define sc_tv_status_total_ns(_s) (0LL) +# define sc_tv_process_total_ns(_s) (0LL) +#endif + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define O2NET_STATS_STR_VERSION 1 +static void sc_show_sock_stats(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + if (!sc) + return; + + seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION, + sc->sc_node->nd_num, (unsigned long)sc_send_count(sc), + (long long)sc_tv_acquiry_total_ns(sc), + (long long)sc_tv_send_total_ns(sc), + (long long)sc_tv_status_total_ns(sc), + (unsigned long)sc_recv_count(sc), + (long long)sc_tv_process_total_ns(sc)); +} + +static void sc_show_sock_container(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + struct inet_sock *inet = NULL; + __be32 saddr = 0, daddr = 0; + __be16 sport = 0, dport = 0; + + if (!sc) + return; + + if (sc->sc_sock) { + inet = inet_sk(sc->sc_sock->sk); + /* the stack's structs aren't sparse endian clean */ + saddr = (__force __be32)inet->inet_saddr; + daddr = (__force __be32)inet->inet_daddr; + sport = (__force __be16)inet->inet_sport; + dport = (__force __be16)inet->inet_dport; + } + + /* 
XXX sigh, inet-> doesn't have sparse annotation so any + * use of it here generates a warning with -Wbitwise */ + seq_printf(seq, "%p:\n" + " krefs: %d\n" + " sock: %pI4:%u -> " + "%pI4:%u\n" + " remote node: %s\n" + " page off: %zu\n" + " handshake ok: %u\n" + " timer: %lld usecs\n" + " data ready: %lld usecs\n" + " advance start: %lld usecs\n" + " advance stop: %lld usecs\n" + " func start: %lld usecs\n" + " func stop: %lld usecs\n" + " func key: 0x%08x\n" + " func type: %u\n", + sc, + atomic_read(&sc->sc_kref.refcount), + &saddr, inet ? ntohs(sport) : 0, + &daddr, inet ? ntohs(dport) : 0, + sc->sc_node->nd_name, + sc->sc_page_off, + sc->sc_handshake_ok, + (long long)ktime_to_us(sc->sc_tv_timer), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop), + sc->sc_msg_key, + sc->sc_msg_type); +} static int sc_seq_show(struct seq_file *seq, void *v) { - struct o2net_sock_container *sc, *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); - if (sc != NULL) { - struct inet_sock *inet = NULL; - - __be32 saddr = 0, daddr = 0; - __be16 sport = 0, dport = 0; - - if (sc->sc_sock) { - inet = inet_sk(sc->sc_sock->sk); - /* the stack's structs aren't sparse endian clean */ - saddr = (__force __be32)inet->inet_saddr; - daddr = (__force __be32)inet->inet_daddr; - sport = (__force __be16)inet->inet_sport; - dport = (__force __be16)inet->inet_dport; - } - - /* XXX sigh, inet-> doesn't have sparse annotation so any - * use of it here generates a warning with -Wbitwise */ - seq_printf(seq, "%p:\n" - " krefs: %d\n" - " sock: %pI4:%u -> " - "%pI4:%u\n" - " remote node: %s\n" - " page off: %zu\n" - " handshake ok: %u\n" - " timer: %lu.%ld\n" - " data ready: %lu.%ld\n" - " advance start: %lu.%ld\n" - " advance stop: %lu.%ld\n" - " func start: %lu.%ld\n" - " func stop: %lu.%ld\n" - " func key: %u\n" - " func type: %u\n", - sc, - atomic_read(&sc->sc_kref.refcount), - &saddr, inet ? ntohs(sport) : 0, - &daddr, inet ? 
ntohs(dport) : 0, - sc->sc_node->nd_name, - sc->sc_page_off, - sc->sc_handshake_ok, - TV_SEC_USEC(sc->sc_tv_timer), - TV_SEC_USEC(sc->sc_tv_data_ready), - TV_SEC_USEC(sc->sc_tv_advance_start), - TV_SEC_USEC(sc->sc_tv_advance_stop), - TV_SEC_USEC(sc->sc_tv_func_start), - TV_SEC_USEC(sc->sc_tv_func_stop), - sc->sc_msg_key, - sc->sc_msg_type); + if (sc) { + if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS) + sc_show_sock_container(seq, sc); + else + sc_show_sock_stats(seq, sc); } - spin_unlock(&o2net_debug_lock); return 0; @@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = { .show = sc_seq_show, }; -static int sc_fop_open(struct inode *inode, struct file *file) +static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) { struct o2net_sock_container *dummy_sc; struct seq_file *seq; @@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file) goto out; seq = file->private_data; - seq->private = dummy_sc; + seq->private = sd; + sd->dbg_sock = dummy_sc; o2net_debug_add_sc(dummy_sc); dummy_sc = NULL; @@ -382,12 +442,48 @@ out: static int sc_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - struct o2net_sock_container *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); return seq_release_private(inode, file); } +static int stats_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_STATS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + +static const struct file_operations stats_seq_fops = { + .open = stats_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +static int sc_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + static const struct file_operations sc_seq_fops = { .open = sc_fop_open, .read = seq_read, @@ -419,25 +515,29 @@ int o2net_debugfs_init(void) goto bail; } + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &stats_seq_fops); + if (!stats_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + return 0; bail: - if (sc_dentry) - debugfs_remove(sc_dentry); - if (nst_dentry) - debugfs_remove(nst_dentry); - if (o2net_dentry) - debugfs_remove(o2net_dentry); + debugfs_remove(stats_dentry); + debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); return -ENOMEM; } void o2net_debugfs_exit(void) { - if (sc_dentry) - debugfs_remove(sc_dentry); - if (nst_dentry) - debugfs_remove(nst_dentry); - if (o2net_dentry) - debugfs_remove(o2net_dentry); + debugfs_remove(stats_dentry); + debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 9aa426e4212..3b11cb1e38f 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, nst->st_node = node; } -static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static inline void o2net_set_nst_sock_time(struct 
o2net_send_tracking *nst) { - do_gettimeofday(&nst->st_sock_time); + nst->st_sock_time = ktime_get(); } -static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { - do_gettimeofday(&nst->st_send_time); + nst->st_send_time = ktime_get(); } -static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { - do_gettimeofday(&nst->st_status_time); + nst->st_status_time = ktime_get(); } -static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) { nst->st_sc = sc; } -static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) { nst->st_id = msg_id; } -#else /* CONFIG_DEBUG_FS */ - -static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) +static inline void o2net_set_sock_timer(struct o2net_sock_container *sc) { + sc->sc_tv_timer = ktime_get(); } -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc) { + sc->sc_tv_data_ready = ktime_get(); } -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc) { + sc->sc_tv_advance_start = ktime_get(); } -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc) { + sc->sc_tv_advance_stop = ktime_get(); } -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) +static inline void o2net_set_func_start_time(struct o2net_sock_container *sc) { + sc->sc_tv_func_start = ktime_get(); } -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, - u32 msg_id) +static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc) { + sc->sc_tv_func_stop = ktime_get(); } +static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) +{ + return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); +} +#else /* CONFIG_DEBUG_FS */ +# define o2net_init_nst(a, b, c, d, e) +# define o2net_set_nst_sock_time(a) +# define o2net_set_nst_send_time(a) +# define o2net_set_nst_status_time(a) +# define o2net_set_nst_sock_container(a, b) +# define o2net_set_nst_msg_id(a, b) +# define o2net_set_sock_timer(a) +# define o2net_set_data_ready_time(a) +# define o2net_set_advance_start_time(a) +# define o2net_set_advance_stop_time(a) +# define o2net_set_func_start_time(a) +# define o2net_set_func_stop_time(a) +# define o2net_get_func_run_time(a) (ktime_t)0 #endif /* CONFIG_DEBUG_FS */ +#ifdef CONFIG_OCFS2_FS_STATS +static void o2net_update_send_stats(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, + ktime_sub(ktime_get(), + nst->st_status_time)); + sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, + ktime_sub(nst->st_status_time, + nst->st_send_time)); + sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, + ktime_sub(nst->st_send_time, + nst->st_sock_time)); + sc->sc_send_count++; +} + 
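
[Editor's aside -- not part of the patch.] o2net_update_send_stats() above folds each message's lifetime into three per-socket running totals: acquiry (socket lookup to send start), send (send start to wait start) and status (wait start to completion), all measured with ktime_get() now that the do_gettimeofday()/timeval pairs are gone. A self-contained userspace sketch of the same bookkeeping, with clock_gettime(CLOCK_MONOTONIC) standing in for ktime_get(); every name below is illustrative only:

	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>

	static int64_t now_ns(void)
	{
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
	}

	struct send_stats {
		int64_t acquiry_ns;	/* socket lookup -> send start */
		int64_t send_ns;	/* send start -> wait start */
		int64_t status_ns;	/* wait start -> completion */
		uint32_t send_count;
	};

	/* Fold one message's timeline into the running totals, as
	 * o2net_update_send_stats() does with ktime_add()/ktime_sub(). */
	static void update_send_stats(struct send_stats *st, int64_t sock_t,
				      int64_t send_t, int64_t status_t)
	{
		st->acquiry_ns += send_t - sock_t;
		st->send_ns += status_t - send_t;
		st->status_ns += now_ns() - status_t;
		st->send_count++;
	}

	int main(void)
	{
		struct send_stats st = { 0 };
		int64_t t0 = now_ns(), t1 = now_ns(), t2 = now_ns();

		update_send_stats(&st, t0, t1, t2);
		printf("sends %u, acquiry %lld ns\n",
		       (unsigned)st.send_count, (long long)st.acquiry_ns);
		return 0;
	}

Keeping the totals as signed 64-bit nanoseconds is also why the stats dump can print them with a plain (long long) cast in sc_show_sock_stats().
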
+static void o2net_update_recv_stats(struct o2net_sock_container *sc) +{ + sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, + o2net_get_func_run_time(sc)); + sc->sc_recv_count++; +} + +#else + +# define o2net_update_send_stats(a, b) + +# define o2net_update_recv_stats(sc) + +#endif /* CONFIG_OCFS2_FS_STATS */ + static inline int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; @@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref) sc->sc_sock = NULL; } + o2nm_undepend_item(&sc->sc_node->nd_item); o2nm_node_put(sc->sc_node); sc->sc_node = NULL; @@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) { struct o2net_sock_container *sc, *ret = NULL; struct page *page = NULL; + int status = 0; page = alloc_page(GFP_NOFS); sc = kzalloc(sizeof(*sc), GFP_NOFS); @@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) o2nm_node_get(node); sc->sc_node = node; + /* pin the node item of the remote node */ + status = o2nm_depend_item(&node->nd_item); + if (status) { + mlog_errno(status); + o2nm_node_put(node); + goto out; + } INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); @@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) if (sk->sk_user_data) { struct o2net_sock_container *sc = sk->sk_user_data; sclog(sc, "data_ready hit\n"); - do_gettimeofday(&sc->sc_tv_data_ready); + o2net_set_data_ready_time(sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); ready = sc->sc_data_ready; } else { @@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_status_time(&nst); wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); + o2net_update_send_stats(&nst, sc); + /* Note that we avoid overwriting the callers status return * variable if a system error was reported on the other * side. Callers beware. 
*/ @@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc, if (syserr != O2NET_ERR_NONE) goto out_respond; - do_gettimeofday(&sc->sc_tv_func_start); + o2net_set_func_start_time(sc); sc->sc_msg_key = be32_to_cpu(hdr->key); sc->sc_msg_type = be16_to_cpu(hdr->msg_type); handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len), nmh->nh_func_data, &ret_data); - do_gettimeofday(&sc->sc_tv_func_stop); + o2net_set_func_stop_time(sc); + + o2net_update_recv_stats(sc); out_respond: /* this destroys the hdr, so don't use it after this */ @@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) size_t datalen; sclog(sc, "receiving\n"); - do_gettimeofday(&sc->sc_tv_advance_start); + o2net_set_advance_start_time(sc); if (unlikely(sc->sc_handshake_ok == 0)) { if(sc->sc_page_off < sizeof(struct o2net_handshake)) { @@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) out: sclog(sc, "ret = %d\n", ret); - do_gettimeofday(&sc->sc_tv_advance_stop); + o2net_set_advance_stop_time(sc); return ret; } @@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data) { struct o2net_sock_container *sc = (struct o2net_sock_container *)data; struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); - struct timeval now; - do_gettimeofday(&now); +#ifdef CONFIG_DEBUG_FS + ktime_t now = ktime_get(); +#endif printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); - mlog(ML_NOTICE, "here are some times that might help debug the " - "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " - "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", - sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, - now.tv_sec, (long) now.tv_usec, - sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, - sc->sc_tv_advance_start.tv_sec, - (long) sc->sc_tv_advance_start.tv_usec, - sc->sc_tv_advance_stop.tv_sec, - (long) sc->sc_tv_advance_stop.tv_usec, + +#ifdef CONFIG_DEBUG_FS + mlog(ML_NOTICE, "Here are some times that might help debug the " + "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " + "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", + (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), sc->sc_msg_key, sc->sc_msg_type, - sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, - sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop)); +#endif /* * Initialize the nn_timeout so that the next connection attempt @@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, msecs_to_jiffies(o2net_keepalive_delay())); - do_gettimeofday(&sc->sc_tv_timer); + o2net_set_sock_timer(sc); mod_timer(&sc->sc_idle_timeout, jiffies + msecs_to_jiffies(o2net_idle_timeout())); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 15fdbdf9eb4..4cbcb65784a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -166,18 +166,27 @@ struct o2net_sock_container { /* original 
handlers for the sockets */ void (*sc_state_change)(struct sock *sk); void (*sc_data_ready)(struct sock *sk, int bytes); -#ifdef CONFIG_DEBUG_FS - struct list_head sc_net_debug_item; -#endif - struct timeval sc_tv_timer; - struct timeval sc_tv_data_ready; - struct timeval sc_tv_advance_start; - struct timeval sc_tv_advance_stop; - struct timeval sc_tv_func_start; - struct timeval sc_tv_func_stop; + u32 sc_msg_key; u16 sc_msg_type; +#ifdef CONFIG_DEBUG_FS + struct list_head sc_net_debug_item; + ktime_t sc_tv_timer; + ktime_t sc_tv_data_ready; + ktime_t sc_tv_advance_start; + ktime_t sc_tv_advance_stop; + ktime_t sc_tv_func_start; + ktime_t sc_tv_func_stop; +#endif +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t sc_tv_acquiry_total; + ktime_t sc_tv_send_total; + ktime_t sc_tv_status_total; + u32 sc_send_count; + u32 sc_recv_count; + ktime_t sc_tv_process_total; +#endif struct mutex sc_send_lock; }; @@ -220,9 +229,9 @@ struct o2net_send_tracking { u32 st_msg_type; u32 st_msg_key; u8 st_node; - struct timeval st_sock_time; - struct timeval st_send_time; - struct timeval st_status_time; + ktime_t st_sock_time; + ktime_t st_send_time; + ktime_t st_status_time; }; #else struct o2net_send_tracking { diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index f4499915683..3a3ed4bb794 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); + struct dlm_lock_resource *res; BUG_ON(!dlm); BUG_ON(!lock); + res = lock->lockres; + assert_spin_locked(&dlm->ast_lock); + if (!list_empty(&lock->ast_list)) { - mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", + mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, " + "AST list not empty, pending %d, newlevel %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), lock->ast_pending, lock->ml.type); BUG(); } if (lock->ast_pending) - mlog(0, "lock has an ast getting flushed right now\n"); + mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); /* putting lock on list, add a ref */ dlm_lock_get(lock); @@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) /* check to see if this ast obsoletes the bast */ if (dlm_should_cancel_bast(dlm, lock)) { - struct dlm_lock_resource *res = lock->lockres; - mlog(0, "%s: cancelling bast for %.*s\n", - dlm->name, res->lockname.len, res->lockname.name); + mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); lock->bast_pending = 0; list_del_init(&lock->bast_list); lock->ml.highest_blocked = LKM_IVMODE; @@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); - BUG_ON(!dlm); BUG_ON(!lock); @@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); + struct dlm_lock_resource *res; BUG_ON(!dlm); BUG_ON(!lock); + 
assert_spin_locked(&dlm->ast_lock); + res = lock->lockres; + BUG_ON(!list_empty(&lock->bast_list)); if (lock->bast_pending) - mlog(0, "lock has a bast getting flushed right now\n"); + mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); /* putting lock on list, add a ref */ dlm_lock_get(lock); @@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); - BUG_ON(!dlm); BUG_ON(!lock); @@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, dlm_astlockfunc_t *fn; struct dlm_lockstatus *lksb; - mlog_entry_void(); + mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); lksb = lock->lksb; fn = lock->ast; @@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lockstatus *lksb; int lksbflags; - mlog_entry_void(); + mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); lksb = lock->lksb; BUG_ON(lock->ml.node == dlm->node_num); @@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, { dlm_bastlockfunc_t *fn = lock->bast; - mlog_entry_void(); BUG_ON(lock->ml.node != dlm->node_num); + mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + blocked_type); + (*fn)(lock->astdata, blocked_type); } @@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, /* cannot get a proxy ast message if this node owns it */ BUG_ON(res->owner == dlm->node_num); - mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -382,8 +407,12 @@ do_ast: if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, &res->granted); - mlog(0, "ast: Adding to granted list... 
type=%d, " - "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); + mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + lock->ml.type, lock->ml.convert_type); + if (lock->ml.convert_type != LKM_IVMODE) { lock->ml.type = lock->ml.convert_type; lock->ml.convert_type = LKM_IVMODE; @@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, size_t veclen = 1; int status; - mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", - res->lockname.len, res->lockname.name, lock->ml.node, - msg_type, blocked_type); + mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name, + res->lockname.len, res->lockname.name, lock->ml.node, msg_type, + blocked_type); memset(&past, 0, sizeof(struct dlm_proxy_ast)); past.node_idx = dlm->node_num; @@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, vec[0].iov_len = sizeof(struct dlm_proxy_ast); vec[0].iov_base = &past; if (flags & DLM_LKSB_GET_LVB) { - mlog(0, "returning requested LVB data\n"); be32_add_cpu(&past.flags, LKM_GET_LVB); vec[1].iov_len = DLM_LVB_LEN; vec[1].iov_base = lock->lksb->lvb; @@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, lock->ml.node, &status); if (ret < 0) - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " - "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, + mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n", + dlm->name, res->lockname.len, res->lockname.name, ret, lock->ml.node); else { if (status == DLM_RECOVERING) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index b36d0bf77a5..4bdf7baee34 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -50,10 +50,10 @@ #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_mle_type { - DLM_MLE_BLOCK, - DLM_MLE_MASTER, - DLM_MLE_MIGRATION, - DLM_MLE_NUM_TYPES + DLM_MLE_BLOCK = 0, + DLM_MLE_MASTER = 1, + DLM_MLE_MIGRATION = 2, + DLM_MLE_NUM_TYPES = 3, }; struct dlm_master_list_entry { @@ -82,8 +82,8 @@ struct dlm_master_list_entry { enum dlm_ast_type { DLM_AST = 0, - DLM_BAST, - DLM_ASTUNLOCK + DLM_BAST = 1, + DLM_ASTUNLOCK = 2, }; @@ -119,9 +119,9 @@ struct dlm_recovery_ctxt enum dlm_ctxt_state { DLM_CTXT_NEW = 0, - DLM_CTXT_JOINED, - DLM_CTXT_IN_SHUTDOWN, - DLM_CTXT_LEAVING, + DLM_CTXT_JOINED = 1, + DLM_CTXT_IN_SHUTDOWN = 2, + DLM_CTXT_LEAVING = 3, }; struct dlm_ctxt @@ -388,8 +388,8 @@ struct dlm_lock enum dlm_lockres_list { DLM_GRANTED_LIST = 0, - DLM_CONVERTING_LIST, - DLM_BLOCKED_LIST + DLM_CONVERTING_LIST = 1, + DLM_BLOCKED_LIST = 2, }; static inline int dlm_lvb_is_empty(char *lvb) @@ -427,27 +427,27 @@ struct dlm_node_iter enum { - DLM_MASTER_REQUEST_MSG = 500, - DLM_UNUSED_MSG1, /* 501 */ - DLM_ASSERT_MASTER_MSG, /* 502 */ - DLM_CREATE_LOCK_MSG, /* 503 */ - DLM_CONVERT_LOCK_MSG, /* 504 */ - DLM_PROXY_AST_MSG, /* 505 */ - DLM_UNLOCK_LOCK_MSG, /* 506 */ - DLM_DEREF_LOCKRES_MSG, /* 507 */ - DLM_MIGRATE_REQUEST_MSG, /* 508 */ - DLM_MIG_LOCKRES_MSG, /* 509 */ - DLM_QUERY_JOIN_MSG, /* 510 */ - DLM_ASSERT_JOINED_MSG, /* 511 */ - DLM_CANCEL_JOIN_MSG, /* 512 */ - DLM_EXIT_DOMAIN_MSG, /* 513 */ - DLM_MASTER_REQUERY_MSG, /* 514 */ - DLM_LOCK_REQUEST_MSG, /* 515 */ - DLM_RECO_DATA_DONE_MSG, /* 516 */ - DLM_BEGIN_RECO_MSG, /* 517 */ - 
DLM_FINALIZE_RECO_MSG, /* 518 */ - DLM_QUERY_REGION, /* 519 */ - DLM_QUERY_NODEINFO, /* 520 */ + DLM_MASTER_REQUEST_MSG = 500, + DLM_UNUSED_MSG1 = 501, + DLM_ASSERT_MASTER_MSG = 502, + DLM_CREATE_LOCK_MSG = 503, + DLM_CONVERT_LOCK_MSG = 504, + DLM_PROXY_AST_MSG = 505, + DLM_UNLOCK_LOCK_MSG = 506, + DLM_DEREF_LOCKRES_MSG = 507, + DLM_MIGRATE_REQUEST_MSG = 508, + DLM_MIG_LOCKRES_MSG = 509, + DLM_QUERY_JOIN_MSG = 510, + DLM_ASSERT_JOINED_MSG = 511, + DLM_CANCEL_JOIN_MSG = 512, + DLM_EXIT_DOMAIN_MSG = 513, + DLM_MASTER_REQUERY_MSG = 514, + DLM_LOCK_REQUEST_MSG = 515, + DLM_RECO_DATA_DONE_MSG = 516, + DLM_BEGIN_RECO_MSG = 517, + DLM_FINALIZE_RECO_MSG = 518, + DLM_QUERY_REGION = 519, + DLM_QUERY_NODEINFO = 520, }; struct dlm_reco_node_data @@ -460,19 +460,19 @@ struct dlm_reco_node_data enum { DLM_RECO_NODE_DATA_DEAD = -1, DLM_RECO_NODE_DATA_INIT = 0, - DLM_RECO_NODE_DATA_REQUESTING, - DLM_RECO_NODE_DATA_REQUESTED, - DLM_RECO_NODE_DATA_RECEIVING, - DLM_RECO_NODE_DATA_DONE, - DLM_RECO_NODE_DATA_FINALIZE_SENT, + DLM_RECO_NODE_DATA_REQUESTING = 1, + DLM_RECO_NODE_DATA_REQUESTED = 2, + DLM_RECO_NODE_DATA_RECEIVING = 3, + DLM_RECO_NODE_DATA_DONE = 4, + DLM_RECO_NODE_DATA_FINALIZE_SENT = 5, }; enum { DLM_MASTER_RESP_NO = 0, - DLM_MASTER_RESP_YES, - DLM_MASTER_RESP_MAYBE, - DLM_MASTER_RESP_ERROR + DLM_MASTER_RESP_YES = 1, + DLM_MASTER_RESP_MAYBE = 2, + DLM_MASTER_RESP_ERROR = 3, }; @@ -649,9 +649,9 @@ struct dlm_proxy_ast #define DLM_MOD_KEY (0x666c6172) enum dlm_query_join_response_code { JOIN_DISALLOW = 0, - JOIN_OK, - JOIN_OK_NO_MAP, - JOIN_PROTOCOL_MISMATCH, + JOIN_OK = 1, + JOIN_OK_NO_MAP = 2, + JOIN_PROTOCOL_MISMATCH = 3, }; struct dlm_query_join_packet { diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 272ec8631a5..04a32be0aeb 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc) kref_get(&dc->debug_refcnt); } -static struct debug_buffer *debug_buffer_allocate(void) +static int debug_release(struct inode *inode, struct file *file) { - struct debug_buffer *db = NULL; - - db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL); - if (!db) - goto bail; - - db->len = PAGE_SIZE; - db->buf = kmalloc(db->len, GFP_KERNEL); - if (!db->buf) - goto bail; - - return db; -bail: - kfree(db); - return NULL; -} - -static ssize_t debug_buffer_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct debug_buffer *db = file->private_data; - - return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len); -} - -static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence) -{ - struct debug_buffer *db = file->private_data; - loff_t new = -1; - - switch (whence) { - case 0: - new = off; - break; - case 1: - new = file->f_pos + off; - break; - } - - if (new < 0 || new > db->len) - return -EINVAL; - - return (file->f_pos = new); + free_page((unsigned long)file->private_data); + return 0; } -static int debug_buffer_release(struct inode *inode, struct file *file) +static ssize_t debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) { - struct debug_buffer *db = file->private_data; - - if (db) - kfree(db->buf); - kfree(db); - - return 0; + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); } /* end - util funcs */ /* begin - purge list funcs */ -static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +static int debug_purgelist_print(struct dlm_ctxt *dlm, char 
*buf, int len) { struct dlm_lock_resource *res; int out = 0; unsigned long total = 0; - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Dumping Purgelist for Domain: %s\n", dlm->name); spin_lock(&dlm->spinlock); list_for_each_entry(res, &dlm->purge_list, purge) { ++total; - if (db->len - out < 100) + if (len - out < 100) continue; spin_lock(&res->spinlock); out += stringify_lockname(res->lockname.name, res->lockname.len, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\t%ld\n", + buf + out, len - out); + out += snprintf(buf + out, len - out, "\t%ld\n", (jiffies - res->last_used)/HZ); spin_unlock(&res->spinlock); } spin_unlock(&dlm->spinlock); - out += snprintf(db->buf + out, db->len - out, - "Total on list: %ld\n", total); + out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); return out; } @@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) static int debug_purgelist_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - struct debug_buffer *db; + char *buf = NULL; - db = debug_buffer_allocate(); - if (!db) + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) goto bail; - db->len = debug_purgelist_print(dlm, db); + i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1)); - file->private_data = db; + file->private_data = buf; return 0; bail: @@ -480,14 +434,14 @@ bail: static const struct file_operations debug_purgelist_fops = { .open = debug_purgelist_open, - .release = debug_buffer_release, - .read = debug_buffer_read, - .llseek = debug_buffer_llseek, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, }; /* end - purge list funcs */ /* begin - debug mle funcs */ -static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) { struct dlm_master_list_entry *mle; struct hlist_head *bucket; @@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); @@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) master_hash_node); ++total; ++bucket_count; - if (db->len - out < 200) + if (len - out < 200) continue; - out += dump_mle(mle, db->buf + out, db->len - out); + out += dump_mle(mle, buf + out, len - out); } longest = max(longest, bucket_count); bucket_count = 0; } spin_unlock(&dlm->master_lock); - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Total: %ld, Longest: %ld\n", total, longest); return out; } @@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) static int debug_mle_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - struct debug_buffer *db; + char *buf = NULL; - db = debug_buffer_allocate(); - if (!db) + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) goto bail; - db->len = debug_mle_print(dlm, db); + i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1)); - file->private_data = db; + file->private_data = buf; return 0; bail: @@ -540,9 +494,9 @@ bail: static const struct file_operations debug_mle_fops = { .open = debug_mle_open, - .release = 
debug_buffer_release, - .read = debug_buffer_read, - .llseek = debug_buffer_llseek, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, }; /* end - debug mle funcs */ @@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = { /* end - debug lockres funcs */ /* begin - debug state funcs */ -static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) { int out = 0; struct dlm_reco_node_data *node; @@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) } /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Domain: %s Key: 0x%08x Protocol: %d.%d\n", dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Thread Pid: %d Node: %d State: %s\n", - dlm->dlm_thread_task->pid, dlm->node_num, state); + task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); /* Number of Joins: xxx Joining Node: xxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Number of Joins: %d Joining Node: %d\n", dlm->num_joins, dlm->joining_node); /* Domain Map: xx xx xx */ - out += snprintf(db->buf + out, db->len - out, "Domain Map: "); + out += snprintf(buf + out, len - out, "Domain Map: "); out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\n"); + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); /* Live Map: xx xx xx */ - out += snprintf(db->buf + out, db->len - out, "Live Map: "); + out += snprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\n"); + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); /* Lock Resources: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Lock Resources: %d (%d)\n", atomic_read(&dlm->res_cur_count), atomic_read(&dlm->res_tot_count)); @@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) cur_mles += atomic_read(&dlm->mle_cur_count[i]); /* MLEs: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "MLEs: %d (%d)\n", cur_mles, tot_mles); /* Blocking: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, " Blocking: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); /* Mastery: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, " Mastery: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); /* Migration: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, " Migration: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... 
*/ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), @@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, atomic_read(&dlm->dlm_refs.refcount)); /* Dead Node: xxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Dead Node: %d\n", dlm->reco.dead_node); /* What about DLM_RECO_STATE_FINALIZE? */ @@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) state = "INACTIVE"; /* Recovery Pid: xxxx Master: xxx State: xxxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Recovery Pid: %d Master: %d State: %s\n", - dlm->dlm_reco_thread_task->pid, + task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, state); /* Recovery Map: xx xx */ - out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); + out += snprintf(buf + out, len - out, "Recovery Map: "); out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\n"); + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); /* Recovery Node State: */ - out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); + out += snprintf(buf + out, len - out, "Recovery Node State:\n"); list_for_each_entry(node, &dlm->reco.node_data, list) { switch (node->state) { case DLM_RECO_NODE_DATA_INIT: @@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) state = "BAD"; break; } - out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", + out += snprintf(buf + out, len - out, "\t%u - %s\n", node->node_num, state); } @@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) static int debug_state_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - struct debug_buffer *db = NULL; + char *buf = NULL; - db = debug_buffer_allocate(); - if (!db) + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) goto bail; - db->len = debug_state_print(dlm, db); + i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1)); - file->private_data = db; + file->private_data = buf; return 0; bail: @@ -936,9 +890,9 @@ bail: static const struct file_operations debug_state_fops = { .open = debug_state_open, - .release = debug_buffer_release, - .read = debug_buffer_read, - .llseek = debug_buffer_llseek, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, }; /* end - debug state funcs */ @@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; if (dc) { - if (dc->debug_purgelist_dentry) - debugfs_remove(dc->debug_purgelist_dentry); - if (dc->debug_mle_dentry) - debugfs_remove(dc->debug_mle_dentry); - if (dc->debug_lockres_dentry) - debugfs_remove(dc->debug_lockres_dentry); - if (dc->debug_state_dentry) - debugfs_remove(dc->debug_state_dentry); + debugfs_remove(dc->debug_purgelist_dentry); + debugfs_remove(dc->debug_mle_dentry); + debugfs_remove(dc->debug_lockres_dentry); + debugfs_remove(dc->debug_state_dentry); 
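/*
 * [Editor's aside -- not part of the patch.]  The dropped NULL checks
 * above, like the matching ones in netdebug.c earlier in this diff,
 * lean on debugfs_remove() being a no-op when handed a NULL (or
 * IS_ERR) dentry, so the old guard pattern
 *
 *	if (dc->debug_state_dentry)
 *		debugfs_remove(dc->debug_state_dentry);
 *
 * collapses to a bare debugfs_remove() call in every teardown and
 * error path touched by this series.
 */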
dlm_debug_put(dc); } } @@ -1040,8 +990,7 @@ bail: void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - if (dlm->dlm_debugfs_subroot) - debugfs_remove(dlm->dlm_debugfs_subroot); + debugfs_remove(dlm->dlm_debugfs_subroot); } /* debugfs root */ @@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void) void dlm_destroy_debugfs_root(void) { - if (dlm_debugfs_root) - debugfs_remove(dlm_debugfs_root); + debugfs_remove(dlm_debugfs_root); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 8c686d22f9c..1f27c4812d1 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -37,11 +37,6 @@ struct dlm_debug_ctxt { struct dentry *debug_purgelist_dentry; }; -struct debug_buffer { - int len; - char *buf; -}; - struct debug_lockres { int dl_len; char *dl_buf; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index cc2aaa96cfe..7e38a072d72 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -460,8 +460,6 @@ redo_bucket: } cond_resched_lock(&dlm->spinlock); num += n; - mlog(0, "%s: touched %d lockreses in bucket %d " - "(tot=%d)\n", dlm->name, n, i, num); } spin_unlock(&dlm->spinlock); wake_up(&dlm->dlm_thread_wq); @@ -1661,8 +1659,8 @@ bail: static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) { - o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); - o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up); + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down); o2net_unregister_handler_list(&dlm->dlm_domain_handlers); } @@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); - status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); if (status) goto bail; o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); - status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); if (status) goto bail; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 69cf369961c..7009292aac5 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; + if (!dlm_lock_compatible(tmplock->ml.convert_type, + lock->ml.type)) + return 0; } return 1; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 2211acf33d9..1d6d1d22c47 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); - assert_spin_locked(&dlm->spinlock); assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ if (list_empty(&res->purge)) { - mlog(0, "putting lockres %.*s:%p onto purge list\n", - res->lockname.len, res->lockname.name, res); + mlog(0, "%s: Adding res %.*s to purge list\n", + dlm->name, res->lockname.len, res->lockname.name); res->last_used = jiffies; dlm_lockres_get(res); @@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, dlm->purge_count++; } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 
- res->lockname.len, res->lockname.name, res, res->owner); + mlog(0, "%s: Removing res %.*s from purge list\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->purge); dlm_lockres_put(res); @@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, master = (res->owner == dlm->node_num); - - mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, - res->lockname.name, master); + mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name, + res->lockname.len, res->lockname.name, master); if (!master) { res->state |= DLM_LOCK_RES_DROPPING_REF; @@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, /* clear our bit from the master's refmap, ignore errors */ ret = dlm_drop_lockres_ref(dlm, res); if (ret < 0) { - mlog_errno(ret); + mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, + res->lockname.len, res->lockname.name, ret); if (!dlm_is_host_down(ret)) BUG(); } - mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", - dlm->name, res->lockname.len, res->lockname.name, ret); spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); } if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s:%p from purgelist, " - "master = %d\n", res->lockname.len, res->lockname.name, - res, master); + mlog(0, "%s: Removing res %.*s from purgelist, master %d\n", + dlm->name, res->lockname.len, res->lockname.name, master); list_del_init(&res->purge); dlm_lockres_put(res); dlm->purge_count--; } if (!__dlm_lockres_unused(res)) { - mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); __dlm_print_one_lock_resource(res); BUG(); @@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, unused = __dlm_lockres_unused(lockres); if (!unused || (lockres->state & DLM_LOCK_RES_MIGRATING)) { - mlog(0, "lockres %s:%.*s: is in use or " - "being remastered, used %d, state %d\n", - dlm->name, lockres->lockname.len, - lockres->lockname.name, !unused, lockres->state); + mlog(0, "%s: res %.*s is in use or being remastered, " + "used %d, state %d\n", dlm->name, + lockres->lockname.len, lockres->lockname.name, + !unused, lockres->state); list_move_tail(&dlm->purge_list, &lockres->purge); spin_unlock(&lockres->spinlock); continue; @@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, struct list_head *head; int can_grant = 1; - //mlog(0, "res->lockname.len=%d\n", res->lockname.len); - //mlog(0, "res->lockname.name=%p\n", res->lockname.name); - //mlog(0, "shuffle res %.*s\n", res->lockname.len, - // res->lockname.name); - - /* because this function is called with the lockres + /* + * Because this function is called with the lockres * spinlock, and because we know that it is not migrating/ * recovering/in-progress, it is fine to reserve asts and - * basts right before queueing them all throughout */ + * basts right before queueing them all throughout + */ assert_spin_locked(&dlm->ast_lock); assert_spin_locked(&res->spinlock); BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| @@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, converting: if (list_empty(&res->converting)) goto blocked; - mlog(0, "res %.*s has locks on a convert queue\n", 
res->lockname.len, - res->lockname.name); + mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name, + res->lockname.len, res->lockname.name); target = list_entry(res->converting.next, struct dlm_lock, list); if (target->ml.convert_type == LKM_IVMODE) { - mlog(ML_ERROR, "%.*s: converting a lock with no " - "convert_type!\n", res->lockname.len, res->lockname.name); + mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n", + dlm->name, res->lockname.len, res->lockname.name); BUG(); } head = &res->granted; @@ -365,9 +356,12 @@ converting: spin_lock(&target->spinlock); BUG_ON(target->ml.highest_blocked != LKM_IVMODE); - mlog(0, "calling ast for converting lock: %.*s, have: %d, " - "granting: %d, node: %u\n", res->lockname.len, - res->lockname.name, target->ml.type, + mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type " + "%d => %d, node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, target->ml.convert_type, target->ml.node); target->ml.type = target->ml.convert_type; @@ -428,11 +422,14 @@ blocked: spin_lock(&target->spinlock); BUG_ON(target->ml.highest_blocked != LKM_IVMODE); - mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " - "node: %u\n", res->lockname.len, res->lockname.name, + mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), target->ml.type, target->ml.node); - // target->ml.type is already correct + /* target->ml.type is already correct */ list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); @@ -453,7 +450,6 @@ leave: /* must have NO locks when calling this with res !=NULL * */ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("dlm=%p, res=%p\n", dlm, res); if (res) { spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("dlm=%p, res=%p\n", dlm, res); - assert_spin_locked(&dlm->spinlock); assert_spin_locked(&res->spinlock); @@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) res->state |= DLM_LOCK_RES_DIRTY; } } + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); } /* Launch the NM thread for the mounted volume */ int dlm_launch_thread(struct dlm_ctxt *dlm) { - mlog(0, "starting dlm thread...\n"); + mlog(0, "Starting dlm_thread...\n"); dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); if (IS_ERR(dlm->dlm_thread_task)) { @@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) void dlm_complete_thread(struct dlm_ctxt *dlm) { if (dlm->dlm_thread_task) { - mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); + mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n"); kthread_stop(dlm->dlm_thread_task); dlm->dlm_thread_task = NULL; } @@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* get an extra ref on lock */ dlm_lock_get(lock); res = lock->lockres; - mlog(0, "delivering an ast for this lockres\n"); + mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + 
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.node); BUG_ON(!lock->ast_pending); @@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* possible that another ast was queued while * we were delivering the last one */ if (!list_empty(&lock->ast_list)) { - mlog(0, "aha another ast got queued while " - "we were finishing the last one. will " - "keep the ast_pending flag set.\n"); + mlog(0, "%s: res %.*s, AST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); } else lock->ast_pending = 0; @@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) dlm_lock_put(lock); spin_unlock(&dlm->ast_lock); - mlog(0, "delivering a bast for this lockres " - "(blocked = %d\n", hi); + mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, " + "blocked %d, node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + hi, lock->ml.node); if (lock->ml.node != dlm->node_num) { ret = dlm_send_proxy_bast(dlm, res, lock, hi); @@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* possible that another bast was queued while * we were delivering the last one */ if (!list_empty(&lock->bast_list)) { - mlog(0, "aha another bast got queued while " - "we were finishing the last one. will " - "keep the bast_pending flag set.\n"); + mlog(0, "%s: res %.*s, BAST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); } else lock->bast_pending = 0; @@ -675,11 +681,12 @@ static int dlm_thread(void *data) spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { __dlm_print_one_lock_resource(res); - mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", - res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", - res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", - res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", - res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d," + " dirty %d\n", dlm->name, + !!(res->state & DLM_LOCK_RES_IN_PROGRESS), + !!(res->state & DLM_LOCK_RES_MIGRATING), + !!(res->state & DLM_LOCK_RES_RECOVERING), + !!(res->state & DLM_LOCK_RES_DIRTY)); } BUG_ON(res->owner != dlm->node_num); @@ -693,8 +700,8 @@ static int dlm_thread(void *data) res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); spin_unlock(&dlm->ast_lock); - mlog(0, "delaying list shuffling for in-" - "progress lockres %.*s, state=%d\n", + mlog(0, "%s: res %.*s, inprogress, delay list " + "shuffle, state %d\n", dlm->name, res->lockname.len, res->lockname.name, res->state); delay = 1; @@ -706,10 +713,6 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. 
*/ - mlog(0, "calling dlm_shuffle_lists with dlm=%s, " - "res=%.*s\n", dlm->name, - res->lockname.len, res->lockname.name); - /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); res->state &= ~DLM_LOCK_RES_DIRTY; @@ -733,7 +736,8 @@ in_progress: /* unlikely, but we may need to give time to * other tasks */ if (!--n) { - mlog(0, "throttling dlm_thread\n"); + mlog(0, "%s: Throttling dlm thread\n", + dlm->name); break; } } diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 6adafa57606..5dbc3062b4f 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -137,9 +137,7 @@ check_gen: } result = d_obtain_alias(inode); - if (!IS_ERR(result)) - d_set_d_op(result, &ocfs2_dentry_ops); - else + if (IS_ERR(result)) mlog_errno(PTR_ERR(result)); bail: @@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) } parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); - if (!IS_ERR(parent)) - d_set_d_op(parent, &ocfs2_dentry_ops); bail_unlock: ocfs2_inode_unlock(dir, 0); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bdadbae0909..a6651956482 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1989,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); } -static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_space_resv sr; int change_size = 1; + int cmd = OCFS2_IOC_RESVSP64; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - if (mode & FALLOC_FL_KEEP_SIZE) change_size = 0; + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = OCFS2_IOC_UNRESVSP64; + sr.l_whence = 0; sr.l_start = (s64)offset; sr.l_len = (s64)len; - return __ocfs2_change_file_space(NULL, inode, offset, - OCFS2_IOC_RESVSP64, &sr, change_size); + return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, + change_size); } int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, @@ -2606,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, - .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2638,6 +2641,7 @@ const struct file_operations ocfs2_fops = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops = { diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f935fd6600d..4068c6c4c6f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, * #1 and #2 can be simply solved by never taking the lock * here for system files (which are the only type we read * during mount). It's a heavier approach, but our main - * concern is user-accesible files anyway. + * concern is user-accessible files anyway. * * #3 works itself out because we'll eventually take the * cluster lock before trusting anything anyway. 
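
[Editor's aside -- not part of the patch.] The ocfs2_fallocate() hunk above is one half of a VFS-wide change that the fs/open.c hunk further down completes: ->fallocate moves from inode_operations to file_operations (hence its removal from ocfs2_file_iops and addition to ocfs2_fops), and FALLOC_FL_PUNCH_HOLE becomes a generic mode bit, which ocfs2 maps onto its existing OCFS2_IOC_UNRESVSP64 unreserve path. From userspace the new mode is reached through the ordinary fallocate(2) call; a minimal caller, assuming a kernel and headers that carry the new flag (file name, offset and length are arbitrary):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR | O_CREAT, 0644);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Punch a 1 MiB hole at offset 4 KiB.  KEEP_SIZE must
		 * accompany PUNCH_HOLE; the do_fallocate() hunk below
		 * rejects the combination without it, since punching a
		 * hole never changes i_size. */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      4096, 1 << 20) < 0)
			perror("fallocate");

		close(fd);
		return 0;
	}
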
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d14cad6e2e4..849fb4a2e81 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, spin_unlock(&oi->ip_lock); bail_add: - d_set_d_op(dentry, &ocfs2_dentry_ops); ret = d_splice_alias(inode, dentry); if (inode) { @@ -415,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir, mlog_errno(status); goto leave; } - d_set_d_op(dentry, &ocfs2_dentry_ops); status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, @@ -743,7 +741,6 @@ static int ocfs2_link(struct dentry *old_dentry, } ihold(inode); - d_set_d_op(dentry, &ocfs2_dentry_ops); d_instantiate(dentry, inode); out_commit: @@ -1017,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, * An error return must mean that no cluster locks * were held on function exit. */ - if (oi1->ip_blkno != oi2->ip_blkno) + if (oi1->ip_blkno != oi2->ip_blkno) { ocfs2_inode_unlock(inode2, 1); + brelse(*bh2); + *bh2 = NULL; + } if (status != -ENOENT) mlog_errno(status); @@ -1794,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir, mlog_errno(status); goto bail; } - d_set_d_op(dentry, &ocfs2_dentry_ops); status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, @@ -2459,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto out_commit; } - d_set_d_op(dentry, &ocfs2_dentry_ops); d_instantiate(dentry, inode); status = 0; out_commit: diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 70dd3b1798f..51cd6898e7f 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -420,6 +420,11 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; + /* + * How many clusters in our truncate log. + * It must be protected by osb_tl_inode->i_mutex. + */ + unsigned int truncated_clusters; struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 5fed60de763..71998d4d61d 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used - * allocation group. This helps us mantain some + * allocation group. This helps us maintain some * contiguousness across allocations. 
*/ status = ocfs2_search_one_group(ac, handle, bits_wanted, min_bits, res, &bits_left); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 17ff46fa8a1..38f986d2447 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -993,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) } /* Handle quota on quotactl */ -static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, - char *path) +static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) { unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; @@ -1013,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type) } static const struct quotactl_ops ocfs2_quotactl_ops = { - .quota_on = ocfs2_quota_on, + .quota_on_meta = ocfs2_quota_on, .quota_off = ocfs2_quota_off, .quota_sync = dquot_quota_sync, .get_info = dquot_get_dqinfo, @@ -2097,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; + sb->s_d_op = &ocfs2_dentry_ops; sb->s_export_op = &ocfs2_export_ops; sb->s_qcop = &ocfs2_quotactl_ops; sb->dq_op = &ocfs2_quota_operations; diff --git a/fs/open.c b/fs/open.c index 4197b9ed023..e52389e1f05 100644 --- a/fs/open.c +++ b/fs/open.c @@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; if (!(file->f_mode & FMODE_WRITE)) @@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - if (!inode->i_op->fallocate) + if (!file->f_op->fallocate) return -EOPNOTSUPP; - return inode->i_op->fallocate(inode, mode, offset, len); + return file->f_op->fallocate(file, mode, offset, len); } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 0a8b0ad0c7e..9c21119512b 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 
1 : 0); +} + ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev, static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, NULL); @@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, @@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } +void __delete_partition(struct hd_struct *part) +{ + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; @@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + hd_struct_put(part); } static ssize_t whole_disk_show(struct device *dev, @@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); + hd_ref_init(p); return p; out_free_info: @@ -507,65 +522,6 @@ out_put: return ERR_PTR(err); } -/* Not exported, helper to add_disk(). */ -void register_disk(struct gendisk *disk) -{ - struct device *ddev = disk_to_dev(disk); - struct block_device *bdev; - struct disk_part_iter piter; - struct hd_struct *part; - int err; - - ddev->parent = disk->driverfs_dev; - - dev_set_name(ddev, disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - /* No minors to use for partitions */ - if (!disk_partitionable(disk)) - goto exit; - - /* No such device (e.g., media were just removed) */ - if (!get_capacity(disk)) - goto exit; - - bdev = bdget_disk(disk, 0); - if (!bdev) - goto exit; - - bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ); - if (err < 0) - goto exit; - blkdev_put(bdev, FMODE_READ); - -exit: - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); -} - static bool disk_unlock_native_capacity(struct gendisk *disk) { const struct block_device_operations *bdops = disk->fops; @@ -728,33 +684,3 @@ fail: } EXPORT_SYMBOL(read_dev_sector); - -void del_gendisk(struct gendisk *disk) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); - while ((part = 
disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); - delete_partition(disk, part->partno); - } - disk_part_iter_exit(&piter); - - invalidate_partition(disk, 0); - blk_free_devt(disk_to_dev(disk)->devt); - set_capacity(disk, 0); - disk->flags &= ~GENHD_FL_UP; - unlink_gendisk(disk); - part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; - if (!sysfs_deprecated) - sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); - device_del(disk_to_dev(disk)); -} diff --git a/fs/pipe.c b/fs/pipe.c index 68f1f8e4e23..da42f7db50d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -441,7 +441,7 @@ redo: break; } if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } pipe_wait(pipe); @@ -450,7 +450,7 @@ redo: /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) @@ -612,7 +612,7 @@ redo2: break; } if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } @@ -623,7 +623,7 @@ redo2: out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) @@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw) if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } @@ -1004,7 +1004,6 @@ struct file *create_write_pipe(int flags) goto err_inode; path.mnt = mntget(pipe_mnt); - d_set_d_op(path.dentry, &pipefs_dentry_operations); d_instantiate(path.dentry, inode); err = -ENFILE; @@ -1266,7 +1265,8 @@ static const struct super_operations pipefs_ops = { static struct dentry *pipefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC); + return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + &pipefs_dentry_operations, PIPEFS_MAGIC); } static struct file_system_type pipe_fs_type = { @@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void) static void __exit exit_pipe_fs(void) { unregister_filesystem(&pipe_fs_type); - mntput_long(pipe_mnt); + mntput(pipe_mnt); } fs_initcall(init_pipe_fs); diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 6a0068841d9..15af6222f8a 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -1,5 +1,5 @@ config PROC_FS - bool "/proc file system support" if EMBEDDED + bool "/proc file system support" if EXPERT default y help This is a virtual file system providing information about the status @@ -40,7 +40,7 @@ config PROC_VMCORE Exports the dump image of crashed kernel in ELF format. 
config PROC_SYSCTL - bool "Sysctl support (/proc/sys)" if EMBEDDED + bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS select SYSCTL default y @@ -61,7 +61,7 @@ config PROC_SYSCTL config PROC_PAGE_MONITOR default y depends on PROC_FS && MMU - bool "Enable /proc page monitoring" if EMBEDDED + bool "Enable /proc page monitoring" if EXPERT help Various /proc files exist to monitor process memory utilization: /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 288a49e098b..df434c5f28f 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -10,12 +10,12 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ proc_tty.o proc-y += cmdline.o +proc-y += consoles.o proc-y += cpuinfo.o proc-y += devices.o proc-y += interrupts.o proc-y += loadavg.o proc-y += meminfo.o -proc-y += proc_console.o proc-y += stat.o proc-y += uptime.o proc-y += version.o diff --git a/fs/proc/array.c b/fs/proc/array.c index fff6572676a..df2b703b9d0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) get_task_comm(tcomm, p); - seq_printf(m, "Name:\t"); + seq_puts(m, "Name:\t"); end = m->buf + m->size; buf = m->buf + m->count; name = tcomm; @@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) buf++; } m->count = buf - m->buf; - seq_printf(m, "\n"); + seq_putc(m, '\n'); } /* @@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d ", GROUP_AT(group_info, g)); put_cred(cred); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } static void render_sigset_t(struct seq_file *m, const char *header, @@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header, { int i; - seq_printf(m, "%s", header); + seq_puts(m, header); i = _NSIG; do { @@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header, seq_printf(m, "%x", x); } while (i >= 4); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, @@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header, { unsigned __capi; - seq_printf(m, "%s", header); + seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); } static inline void task_cap(struct seq_file *m, struct task_struct *p) @@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); + seq_puts(m, "Cpus_allowed:\t"); seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); + seq_putc(m, '\n'); + seq_puts(m, "Cpus_allowed_list:\t"); seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { size = task_statm(mm, &shared, &text, &data, 
&resident); mmput(mm); } - seq_printf(m, "%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + size, resident, shared, text, data); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index b20962c71a5..9d096e82b20 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < 32; i++) { - if (task->latency_record[i].backtrace[0]) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { int q; - seq_printf(m, "%i %li %li ", - task->latency_record[i].count, - task->latency_record[i].time, - task->latency_record[i].max); + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_SYMBOL_LEN]; - char *c; - if (!task->latency_record[i].backtrace[q]) + unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (task->latency_record[i].backtrace[q] == ULONG_MAX) + if (bt == ULONG_MAX) break; - sprint_symbol(sym, task->latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); + seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); } } @@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v) static int proc_single_open(struct inode *inode, struct file *filp) { - int ret; - ret = single_open(filp, proc_single_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; + return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { @@ -1164,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1177,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. 
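The proc_single_open() rewrite above relies on single_open() already stashing its third argument in seq_file->private, making the open-coded assignment redundant; sched_open() and comm_open() in the hunks below get the same treatment. A minimal sketch of the pattern (example_show/example_open are hypothetical names, not taken from this patch):

#include <linux/fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	/* single_open() placed the inode in m->private for us. */
	struct inode *inode = m->private;

	seq_printf(m, "ino %lu\n", inode->i_ino);
	return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
	/*
	 * One call replaces the old single_open(filp, show, NULL)
	 * followed by a by-hand m->private = inode assignment.
	 */
	return single_open(filp, example_show, inode);
}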
@@ -1386,15 +1375,7 @@ sched_write(struct file *file, const char __user *buf, static int sched_open(struct inode *inode, struct file *filp) { - int ret; - - ret = single_open(filp, sched_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; + return single_open(filp, sched_show, inode); } static const struct file_operations proc_pid_sched_operations = { @@ -1530,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v) static int comm_open(struct inode *inode, struct file *filp) { - int ret; - - ret = single_open(filp, comm_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; + return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { diff --git a/fs/proc/proc_console.c b/fs/proc/consoles.c index 8a707609f52..eafc22ab1fd 100644 --- a/fs/proc/proc_console.c +++ b/fs/proc/consoles.c @@ -106,9 +106,9 @@ static const struct file_operations proc_consoles_operations = { .release = seq_release, }; -static int register_proc_consoles(void) +static int __init proc_consoles_init(void) { proc_create("consoles", 0, NULL, &proc_consoles_operations); return 0; } -module_init(register_proc_consoles); +module_init(proc_consoles_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 59ee7da959c..b14347167c3 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v) if (i < CHRDEV_MAJOR_HASH_SIZE) { if (i == 0) - seq_printf(f, "Character devices:\n"); + seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { i -= CHRDEV_MAJOR_HASH_SIZE; if (i == 0) - seq_printf(f, "\nBlock devices:\n"); + seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); } #endif diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f766be29d2c..01e07f2a188 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, if (de->namelen != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino; - - ino = de->low_ino; pde_get(de); spin_unlock(&proc_subdir_lock); error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); + inode = proc_get_inode(dir->i_sb, de); goto out_unlock; } } @@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - unsigned int ino = de->low_ino; - - if (ino < PROC_DYNAMIC_FIRST) - return; - - release_inode_number(ino); + release_inode_number(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); @@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) wait_for_completion(de->pde_unload_completion); - goto continue_removing; + spin_lock(&de->pde_unload_lock); } - spin_unlock(&de->pde_unload_lock); -continue_removing: - spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6bcb926b101..176ce4cda68 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -416,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, - struct proc_dir_entry *de) +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode * inode; - inode = iget_locked(sb, ino); + inode = iget_locked(sb, 
de->low_ino); if (!inode) return NULL; if (inode->i_state & I_NEW) { @@ -471,7 +470,7 @@ int proc_fill_super(struct super_block *s) s->s_time_gran = 1; pde_get(&proc_root); - root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); + root_inode = proc_get_inode(s, &proc_root); if (!root_inode) goto out_no_root; root_inode->i_uid = 0; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 1f24a3eddd1..9ad561ded40 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock; struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); unsigned long task_vsize(struct mm_struct *); -int task_statm(struct mm_struct *, int *, int *, int *, int *); +unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, unsigned long *, unsigned long *); void task_mem(struct seq_file *, struct mm_struct *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) @@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde); extern struct vfsmount *proc_mnt; int proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); +struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6f37c391468..d245cb23dd7 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp) static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, - .llseek = generic_file_llseek, + .llseek = default_llseek, }; #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239cfd97..ed257d14156 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif , K(i.totalram), K(i.freeram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif ); hugetlb_report_meminfo(m); diff --git a/fs/proc/page.c b/fs/proc/page.c index 3b8b4566033..6d8e6a9e93a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, ppage = pfn_to_page(pfn); else ppage = NULL; - if (!ppage) + if (!ppage || PageSlab(ppage)) pcount = 0; else pcount = page_mapcount(ppage); @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * 
PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 83adcc86943..cb761f01030 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p, } switch (p->type) { case TTY_DRIVER_TYPE_SYSTEM: - seq_printf(m, "system"); + seq_puts(m, "system"); if (p->subtype == SYSTEM_TYPE_TTY) - seq_printf(m, ":/dev/tty"); + seq_puts(m, ":/dev/tty"); else if (p->subtype == SYSTEM_TYPE_SYSCONS) - seq_printf(m, ":console"); + seq_puts(m, ":console"); else if (p->subtype == SYSTEM_TYPE_CONSOLE) - seq_printf(m, ":vtmaster"); + seq_puts(m, ":vtmaster"); break; case TTY_DRIVER_TYPE_CONSOLE: - seq_printf(m, "console"); + seq_puts(m, "console"); break; case TTY_DRIVER_TYPE_SERIAL: - seq_printf(m, "serial"); + seq_puts(m, "serial"); break; case TTY_DRIVER_TYPE_PTY: if (p->subtype == PTY_TYPE_MASTER) - seq_printf(m, "pty:master"); + seq_puts(m, "pty:master"); else if (p->subtype == PTY_TYPE_SLAVE) - seq_printf(m, "pty:slave"); + seq_puts(m, "pty:slave"); else - seq_printf(m, "pty"); + seq_puts(m, "pty"); break; default: seq_printf(m, "type:%d.%d", p->type, p->subtype); @@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v) /* pseudo-drivers first */ seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); - seq_printf(m, "system:/dev/tty\n"); + seq_puts(m, "system:/dev/tty\n"); seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); - seq_printf(m, "system:console\n"); + seq_puts(m, "system:console\n"); #ifdef CONFIG_UNIX98_PTYS seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); - seq_printf(m, "system\n"); + seq_puts(m, "system\n"); #endif #ifdef CONFIG_VT seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); - seq_printf(m, "system:vtmaster\n"); + seq_puts(m, "system:vtmaster\n"); #endif } diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 37994737c98..62604be9f58 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v) { int i, j; - seq_printf(p, " "); + seq_puts(p, " "); for_each_possible_cpu(i) seq_printf(p, "CPU%-8d", i); - seq_printf(p, "\n"); + seq_putc(p, '\n'); for (i = 0; i < NR_SOFTIRQS; i++) { seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); - seq_printf(p, "\n"); + seq_putc(p, '\n'); } return 0; } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index e15a19c93ba..1cffa2b8a2f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v) for (i = 0; i < NR_SOFTIRQS; i++) seq_printf(p, " %u", per_softirq_sums[i]); - seq_printf(p, "\n"); + seq_putc(p, '\n'); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 
c126c83b9a4..60b914860f8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm) return PAGE_SIZE * mm->total_vm; } -int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) { *shared = get_mm_counter(mm, MM_FILEPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) @@ -417,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v) "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -429,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v) mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index cb6306e6384..b535d3e5d5f 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm) return vsize; } -int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) { struct vm_area_struct *vma; struct vm_region *region; struct rb_node *p; - int size = kobjsize(mm); + unsigned long size = kobjsize(mm); down_read(&mm->mmap_sem); for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0fed41e6efc..a2a622e079f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); void __quota_error(struct super_block *sb, const char *func, - const char *fmt, ...) + const char *fmt, ...) 
{ - va_list args; - if (printk_ratelimit()) { + va_list args; + struct va_format vaf; + va_start(args, fmt); - printk(KERN_ERR "Quota error (device %s): %s: ", - sb->s_id, func); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "Quota error (device %s): %s: %pV\n", + sb->s_id, func, &vaf); + va_end(args); } } @@ -2185,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type) } EXPORT_SYMBOL(dquot_resume); -int dquot_quota_on_path(struct super_block *sb, int type, int format_id, - struct path *path) +int dquot_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) { int error = security_quota_on(path->dentry); if (error) @@ -2200,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id, DQUOT_LIMITS_ENABLED); return error; } -EXPORT_SYMBOL(dquot_quota_on_path); - -int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name) -{ - struct path path; - int error; - - error = kern_path(name, LOOKUP_FOLLOW, &path); - if (!error) { - error = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - } - return error; -} EXPORT_SYMBOL(dquot_quota_on); /* diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b299961e1ed..b34bdb25490 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -64,18 +64,15 @@ static int quota_sync_all(int type) } static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr) + struct path *path) { - char *pathname; - int ret = -ENOSYS; - - pathname = getname(addr); - if (IS_ERR(pathname)) - return PTR_ERR(pathname); - if (sb->s_qcop->quota_on) - ret = sb->s_qcop->quota_on(sb, type, id, pathname); - putname(pathname); - return ret; + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + return -ENOSYS; + if (sb->s_qcop->quota_on_meta) + return sb->s_qcop->quota_on_meta(sb, type, id); + if (IS_ERR(path)) + return PTR_ERR(path); + return sb->s_qcop->quota_on(sb, type, id, path); } static int quota_getfmt(struct super_block *sb, int type, void __user *addr) @@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr) + void __user *addr, struct path *path) { int ret; @@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, switch (cmd) { case Q_QUOTAON: - return quota_quotaon(sb, type, cmd, id, addr); + return quota_quotaon(sb, type, cmd, id, path); case Q_QUOTAOFF: if (!sb->s_qcop->quota_off) return -ENOSYS; @@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, { uint cmds, type; struct super_block *sb = NULL; + struct path path, *pathp = NULL; int ret; cmds = cmd >> SUBCMDSHIFT; @@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, return -ENODEV; } + /* + * Path for quotaon has to be resolved before grabbing superblock + * because that gets s_umount sem which is also possibly needed by path + * resolution (think about autofs) and thus deadlocks could arise. 
+ */ + if (cmds == Q_QUOTAON) { + ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); + if (ret) + pathp = ERR_PTR(ret); + else + pathp = &path; + } + sb = quotactl_block(special); if (IS_ERR(sb)) return PTR_ERR(sb); - ret = do_quotactl(sb, type, cmds, id, addr); + ret = do_quotactl(sb, type, cmds, id, addr, pathp); drop_super(sb); + if (pathp && !IS_ERR(pathp)) + path_put(pathp); return ret; } diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 9e48874eabc..e41c1becf09 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { - quota_error(dquot->dq_sb, "Can't read quota data " - "block %u", blk); + quota_error(dquot->dq_sb, "Can't read quota data block %u", + *blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = write_blk(info, *blk, buf); if (ret < 0) - quota_error(dquot->dq_sb, "Can't write quota " - "tree block %u", blk); + quota_error(dquot->dq_sb, + "Can't write quota tree block %u", + *blk); } } out_buf: diff --git a/fs/read_write.c b/fs/read_write.c index 5d431bacbea..5520f8ad550 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -30,18 +30,9 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -static int -__negative_fpos_check(struct file *file, loff_t pos, size_t count) +static inline int unsigned_offsets(struct file *file) { - /* - * pos or pos+count is negative here, check overflow. - * too big "count" will be caught in rw_verify_area(). - */ - if ((pos < 0) && (pos + count < pos)) - return -EOVERFLOW; - if (file->f_mode & FMODE_UNSIGNED_OFFSET) - return 0; - return -EINVAL; + return file->f_mode & FMODE_UNSIGNED_OFFSET; } /** @@ -75,7 +66,7 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) break; } - if (offset < 0 && __negative_fpos_check(file, offset, 0)) + if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > inode->i_sb->s_maxbytes) return -EINVAL; @@ -152,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) offset += file->f_pos; } retval = -EINVAL; - if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { + if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -252,9 +243,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count if (unlikely((ssize_t) count < 0)) return retval; pos = *ppos; - if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { - retval = __negative_fpos_check(file, pos, count); - if (retval) + if (unlikely(pos < 0)) { + if (!unsigned_offsets(file)) + return retval; + if (count >= -pos) /* both values are in 0..LLONG_MAX */ + return -EOVERFLOW; + } else if (unlikely((loff_t) (pos + count) < 0)) { + if (!unsigned_offsets(file)) return retval; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d31bce1a9f9..3eea859e699 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2551,8 +2551,6 @@ static int release_journal_dev(struct super_block *super, result = 0; if (journal->j_dev_bd != NULL) { - if (journal->j_dev_bd->bd_dev != super->s_dev) - bd_release(journal->j_dev_bd); result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); journal->j_dev_bd = NULL; } @@ -2570,7 +2568,7 @@ static int 
journal_init_dev(struct super_block *super, { int result; dev_t jdev; - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; char b[BDEVNAME_SIZE]; result = 0; @@ -2584,7 +2582,10 @@ static int journal_init_dev(struct super_block *super, /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2593,22 +2594,14 @@ static int journal_init_dev(struct super_block *super, "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; - } else if (jdev != super->s_dev) { - result = bd_claim(journal->j_dev_bd, journal); - if (result) { - blkdev_put(journal->j_dev_bd, blkdev_mode); - return result; - } - + } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); - } return 0; } journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = open_bdev_exclusive(jdev_name, - blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index adbc6f53851..45de98b5946 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int l va_list args; int mode, first, last; - va_start(args, bh); - if (!bh) { printk("print_block: buffer is NULL\n"); return; } + va_start(args, bh); + mode = va_arg(args, int); first = va_arg(args, int); last = va_arg(args, int); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2575682a9ea..0aab04f4682 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -632,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, char *); +static int reiserfs_quota_on(struct super_block *, int, int, struct path *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -2048,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; struct inode *inode; struct reiserfs_transaction_handle th; if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { + if (path->mnt->mnt_sb != sb) { err = -EXDEV; goto out; } - inode = path.dentry->d_inode; + inode = path->dentry->d_inode; /* We must not pack tails for quota files on reiserfs for quota IO to work */ if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { err = reiserfs_unpack(inode, NULL); @@ -2082,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, /* Journaling quota? 
*/ if (REISERFS_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) reiserfs_warning(sb, "super-6521", "Quota file not on filesystem root. " "Journalled quota will not work."); @@ -2101,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = dquot_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on(sb, type, format_id, path); out: - path_put(&path); return err; } diff --git a/fs/select.c b/fs/select.c index b7b10aa3086..e56560d2b08 100644 --- a/fs/select.c +++ b/fs/select.c @@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, rts.tv_sec = rts.tv_nsec = 0; if (timeval) { + if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) + memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; diff --git a/fs/splice.c b/fs/splice.c index ce2f02579e3..50a5d978da1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; loff_t pos = sd->pos; - int ret, more; - - ret = buf->ops->confirm(pipe, buf); - if (!ret) { - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - if (file->f_op && file->f_op->sendpage) - ret = file->f_op->sendpage(file, buf->page, buf->offset, - sd->len, &pos, more); - else - ret = -EINVAL; - } + int more; - return ret; + if (!likely(file->f_op && file->f_op->sendpage)) + return -EINVAL; + + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + return file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); } /* @@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *fsdata; int ret; - /* - * make sure the data in this buffer is uptodate - */ - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; - offset = sd->pos & ~PAGE_CACHE_MASK; this_len = sd->len; @@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (sd->len > sd->total_len) sd->len = sd->total_len; - ret = actor(pipe, buf, sd); - if (ret <= 0) { + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret; } + + ret = actor(pipe, buf, sd); + if (ret <= 0) + return ret; + buf->offset += ret; buf->len -= ret; @@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, int ret; void *data; - ret = buf->ops->confirm(pipe, buf); - if (ret) - return ret; - data = buf->ops->map(pipe, buf, 0); ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); buf->ops->unmap(pipe, buf, data); @@ -1495,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, char *src; int ret; - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; - /* * See if we can use the atomic maps, by prefaulting in the * pages and doing an atomic copy diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index e5f63da64d0..aa68a8a3151 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -29,7 +29,6 @@ config SQUASHFS config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS - default n help Saying Y here includes support for extended attributes (xattrs). 
Xattrs are name:value pairs associated with inodes by @@ -40,7 +39,6 @@ config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS - default n select LZO_DECOMPRESS help Saying Y here includes support for reading Squashfs file systems @@ -53,10 +51,24 @@ If unsure, say N. +config SQUASHFS_XZ + bool "Include support for XZ compressed file systems" + depends on SQUASHFS + select XZ_DEC + help + Saying Y here includes support for reading Squashfs file systems + compressed with XZ compression. XZ gives better compression than + the default zlib compression, at the expense of greater CPU and + memory overhead. + + XZ is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS - default n help Saying Y here allows you to specify cache size. diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7672bac8d32..cecf2bea07a 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o +squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 653c030eb84..2fb2882f0fa 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -34,7 +34,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 57314bee905..26b15ae34d6 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -55,7 +55,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 24af9ce9722..a5940e54c4d 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -27,7 +27,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "decompressor.h" #include "squashfs.h" @@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { }; #ifndef CONFIG_SQUASHFS_LZO -static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { +static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif +#ifndef CONFIG_SQUASHFS_XZ +static const struct squashfs_decompressor squashfs_xz_comp_ops = { + NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 +}; +#endif + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, + &squashfs_lzma_unsupported_comp_ops, &squashfs_unknown_comp_ops }; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 7425f80783f..3b305a70f7a 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk, return
msblk->decompressor->decompress(msblk, buffer, bh, b, offset, length, srclength, pages); } + +#ifdef CONFIG_SQUASHFS_XZ +extern const struct squashfs_decompressor squashfs_xz_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_LZO +extern const struct squashfs_decompressor squashfs_lzo_comp_ops; +#endif + #endif diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index 7c90bbd6879..7eef571443c 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -39,7 +39,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index b7f64bcd2b7..d8f32452638 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -37,7 +37,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 5d87789bf1c..7da759e34c5 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -29,7 +29,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 5d45569d5f7..ba729d80887 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -27,11 +27,6 @@ #define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) -static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) -{ - return list_entry(inode, struct squashfs_inode_info, vfs_inode); -} - /* block.c */ extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, int, int); @@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[]; /* zlib_wrapper.c */ extern const struct squashfs_decompressor squashfs_zlib_comp_ops; - -/* lzo_wrapper.c */ -extern const struct squashfs_decompressor squashfs_lzo_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index c5137fc9ab1..39533feffd6 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -238,6 +238,7 @@ struct meta_index { #define ZLIB_COMPRESSION 1 #define LZMA_COMPRESSION 2 #define LZO_COMPRESSION 3 +#define XZ_COMPRESSION 4 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index d3e3a37f28a..359baefc01f 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -45,4 +45,10 @@ struct squashfs_inode_info { }; struct inode vfs_inode; }; + + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} #endif diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index d33be5dd6c3..05385dbe146 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -32,7 +32,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c new file mode 100644 index 00000000000..856756ca5ee --- /dev/null +++ b/fs/squashfs/xz_wrapper.c @@ -0,0 +1,153 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * 
or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xz_wrapper.c + */ + + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/xz.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_xz { + struct xz_dec *state; + struct xz_buf buf; +}; + +static void *squashfs_xz_init(struct squashfs_sb_info *msblk) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + + struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + + stream->state = xz_dec_init(XZ_PREALLOC, block_size); + if (stream->state == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate xz workspace\n"); + kfree(stream); + return NULL; +} + + +static void squashfs_xz_free(void *strm) +{ + struct squashfs_xz *stream = strm; + + if (stream) { + xz_dec_end(stream->state); + kfree(stream); + } +} + + +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + enum xz_ret xz_err; + int avail, total = 0, k = 0, page = 0; + struct squashfs_xz *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + xz_dec_reset(stream->state); + stream->buf.in_pos = 0; + stream->buf.in_size = 0; + stream->buf.out_pos = 0; + stream->buf.out_size = PAGE_CACHE_SIZE; + stream->buf.out = buffer[page++]; + + do { + if (stream->buf.in_pos == stream->buf.in_size && k < b) { + avail = min(length, msblk->devblksize - offset); + length -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + if (avail == 0) { + offset = 0; + put_bh(bh[k++]); + continue; + } + + stream->buf.in = bh[k]->b_data + offset; + stream->buf.in_size = avail; + stream->buf.in_pos = 0; + offset = 0; + } + + if (stream->buf.out_pos == stream->buf.out_size + && page < pages) { + stream->buf.out = buffer[page++]; + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } + + xz_err = xz_dec_run(stream->state, &stream->buf); + + if (stream->buf.in_pos == stream->buf.in_size && k < b) + put_bh(bh[k++]); + } while (xz_err == XZ_OK); + + if (xz_err != XZ_STREAM_END) { + ERROR("xz_dec_run error, data probably corrupt\n"); + goto release_mutex; + } + + if (k < b) { + ERROR("xz_uncompress error, input remaining\n"); + goto release_mutex; + } + + total += stream->buf.out_pos; + mutex_unlock(&msblk->read_data_mutex); + return total; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_xz_comp_ops = { + .init = squashfs_xz_init, + .free = squashfs_xz_free, + .decompress = squashfs_xz_uncompress, + .id = XZ_COMPRESSION, + .name = "xz", + .supported = 1 +}; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 7a603874e48..818a5e063fa 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -29,7 +29,6 @@ 
#include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" @@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, struct buffer_head **bh, int b, int offset, int length, int srclength, int pages) { - int zlib_err = 0, zlib_init = 0; - int avail, bytes, k = 0, page = 0; + int zlib_err, zlib_init = 0; + int k = 0, page = 0; z_stream *stream = msblk->stream; mutex_lock(&msblk->read_data_mutex); @@ -75,11 +74,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, stream->avail_out = 0; stream->avail_in = 0; - bytes = length; do { if (stream->avail_in == 0 && k < b) { - avail = min(bytes, msblk->devblksize - offset); - bytes -= avail; + int avail = min(length, msblk->devblksize - offset); + length -= avail; wait_on_buffer(bh[k]); if (!buffer_uptodate(bh[k])) goto release_mutex; @@ -128,6 +126,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto release_mutex; } + if (k < b) { + ERROR("zlib_uncompress error, data remaining\n"); + goto release_mutex; + } + length = stream->total_out; mutex_unlock(&msblk->read_data_mutex); return length; diff --git a/fs/stat.c b/fs/stat.c index 12e90e21390..d5c61cf2b70 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flag & AT_NO_AUTOMOUNT) + lookup_flags |= LOOKUP_NO_AUTOMOUNT; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) diff --git a/fs/super.c b/fs/super.c index 823e061faa8..74e149efed8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -767,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -802,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, /* * s_umount nests inside bd_mutex during - * __invalidate_device(). close_bdev_exclusive() - * acquires bd_mutex and can't be called under - * s_umount. Drop s_umount temporarily. This is safe - * as we're holding an active reference. + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. 
*/ up_write(&s->s_umount); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; @@ -832,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); error: return ERR_PTR(error); } @@ -863,7 +863,8 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - close_bdev_exclusive(bdev, mode); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); + blkdev_put(bdev, mode | FMODE_EXCL); } EXPORT_SYMBOL(kill_block_super); @@ -1140,7 +1141,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return mnt; err: - mntput_long(mnt); + mntput(mnt); return ERR_PTR(err); } diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig index f4b67588b9d..8c41feacbac 100644 --- a/fs/sysfs/Kconfig +++ b/fs/sysfs/Kconfig @@ -1,5 +1,5 @@ config SYSFS - bool "sysfs file system support" if EMBEDDED + bool "sysfs file system support" if EXPERT default y help The sysfs filesystem is a virtual filesystem that the kernel uses to diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 442f34ff1af..c8769dc222d 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj, struct attribute *const *attr; int i; - if (grp) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); - else - dir_sd = sysfs_get(kobj->sd); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); if (!dir_sd) return -ENOENT; @@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; struct attribute *const *attr; - if (grp) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); - else - dir_sd = sysfs_get(kobj->sd); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); if (dir_sd) { for (attr = grp->attrs; *attr; ++attr) sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 30ac2734558..0a12eb89cd3 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/xattr.h> #include <linux/security.h> #include "sysfs.h" diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ffaaa816bfb..3d28af31d86 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -9,6 +9,7 @@ */ #include <linux/lockdep.h> +#include <linux/kobject_ns.h> #include <linux/fs.h> struct sysfs_open_dirent; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index b5e68da2db3..b427b1208c2 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -48,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st struct inode * inode = NULL; ino_t ino; - d_set_d_op(dentry, dir->i_sb->s_root->d_op); if (dentry->d_name.len > SYSV_NAMELEN) return ERR_PTR(-ENAMETOOLONG); ino = sysv_inode_by_name(dentry); diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 76712aefc4a..f60c196913e 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size) sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; + if (sbi->s_forced_ro) + sb->s_flags |= MS_RDONLY; + if (sbi->s_truncate) + sb->s_d_op = &sysv_dentry_operations; root_inode = sysv_iget(sb, SYSV_ROOT_INO); if (IS_ERR(root_inode)) { printk("SysV FS: get root inode 
failed\n"); @@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size) printk("SysV FS: get root dentry failed\n"); return 0; } - if (sbi->s_forced_ro) - sb->s_flags |= MS_RDONLY; - if (sbi->s_truncate) - d_set_d_op(sb->s_root, &sysv_dentry_operations); return 1; } diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index f8def3c8ea4..0e0e99bd6bc 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,5 @@ config UDF_FS tristate "UDF file system support" - depends on BKL # needs serious work to remove select CRC_ITU_T help This is the new file system used on some CD-ROMs and DVDs. Say Y if diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index b608efaa4ce..306ee39ef2c 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb, udf_debug("bit %ld already set\n", bit + i); udf_debug("byte=%2x\n", ((char *)bh->b_data)[(bit + i) >> 3]); - } else { - udf_add_free_space(sb, sbi->s_partition, 1); } } + udf_add_free_space(sb, sbi->s_partition, count); mark_buffer_dirty(bh); if (overflow) { block += count; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 51552bf5022..eb8bfe2b89a 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -30,7 +30,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include "udf_i.h" @@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *dir = filp->f_path.dentry->d_inode; int result; - lock_kernel(); - if (filp->f_pos == 0) { if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { - unlock_kernel(); return 0; } filp->f_pos++; } result = do_udf_readdir(dir, filp, filldir, dirent); - unlock_kernel(); return result; } diff --git a/fs/udf/file.c b/fs/udf/file.c index 66b9e7e7e4c..89c78486cbb 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -32,7 +32,6 @@ #include <linux/string.h> /* memset */ #include <linux/capability.h> #include <linux/errno.h> -#include <linux/smp_lock.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/aio.h> @@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t count = iocb->ki_left; struct udf_inode_info *iinfo = UDF_I(inode); + down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (file->f_flags & O_APPEND) pos = inode->i_size; @@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, udf_expand_file_adinicb(inode, pos + count, &err); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { udf_debug("udf_expand_adinicb: err=%d\n", err); + up_write(&iinfo->i_data_sem); return err; } } else { @@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, iinfo->i_lenAlloc = inode->i_size; } } + up_write(&iinfo->i_data_sem); retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); if (retval > 0) @@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) long old_block, new_block; int result = -EINVAL; - lock_kernel(); - if (file_permission(filp, MAY_READ) != 0) { udf_debug("no permission to access inode %lu\n", inode->i_ino); result = -EPERM; @@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } out: - unlock_kernel(); return result; } @@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & 
FMODE_WRITE) { mutex_lock(&inode->i_mutex); - lock_kernel(); + down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); - unlock_kernel(); + up_write(&UDF_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } return 0; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 75d9304d0dc..6fb7e0adcda 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) return NULL; } - mutex_lock(&sbi->s_alloc_mutex); if (sbi->s_lvid_bh) { - struct logicalVolIntegrityDesc *lvid = - (struct logicalVolIntegrityDesc *) - sbi->s_lvid_bh->b_data; - struct logicalVolIntegrityDescImpUse *lvidiu = - udf_sb_lvidiu(sbi); - struct logicalVolHeaderDesc *lvhd; - uint64_t uniqueID; - lvhd = (struct logicalVolHeaderDesc *) - (lvid->logicalVolContentsUse); + struct logicalVolIntegrityDescImpUse *lvidiu; + + iinfo->i_unique = lvid_get_unique_id(sb); + mutex_lock(&sbi->s_alloc_mutex); + lvidiu = udf_sb_lvidiu(sbi); if (S_ISDIR(mode)) le32_add_cpu(&lvidiu->numDirs, 1); else le32_add_cpu(&lvidiu->numFiles, 1); - iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID); - if (!(++uniqueID & 0x00000000FFFFFFFFUL)) - uniqueID += 16; - lvhd->uniqueID = cpu_to_le64(uniqueID); udf_updated_lvid(sb); + mutex_unlock(&sbi->s_alloc_mutex); } - mutex_unlock(&sbi->s_alloc_mutex); inode_init_owner(inode, dir, mode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index fc48f37aa2d..c6a2e782b97 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -31,7 +31,6 @@ #include "udfdecl.h" #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> @@ -51,6 +50,7 @@ MODULE_LICENSE("GPL"); static mode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static void udf_fill_inode(struct inode *, struct buffer_head *); +static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, sector_t *, int *); @@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode) want_delete = 1; inode->i_size = 0; udf_truncate(inode); - lock_kernel(); udf_update_inode(inode, IS_SYNC(inode)); - unlock_kernel(); } invalidate_inode_buffers(inode); end_writeback(inode); @@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode) kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; if (want_delete) { - lock_kernel(); udf_free_inode(inode); - unlock_kernel(); } } @@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block, err = -EIO; new = 0; bh = NULL; - - lock_kernel(); - iinfo = UDF_I(inode); + + down_write(&iinfo->i_data_sem); if (block == iinfo->i_next_alloc_block + 1) { iinfo->i_next_alloc_block++; iinfo->i_next_alloc_goal++; @@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block, map_bh(bh_result, inode->i_sb, phys); abort: - unlock_kernel(); + up_write(&iinfo->i_data_sem); return err; } @@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - lock_kernel(); iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + inode->i_size)) { udf_expand_file_adinicb(inode, inode->i_size, &err); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { inode->i_size = iinfo->i_lenAlloc; - 
unlock_kernel(); + up_write(&iinfo->i_data_sem); return; } else udf_truncate_extents(inode); @@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode) offset - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = inode->i_size; } + up_write(&iinfo->i_data_sem); } else { block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block); + down_write(&iinfo->i_data_sem); udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); } inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); @@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode) udf_sync_inode(inode); else mark_inode_dirty(inode); - unlock_kernel(); } static void __udf_read_inode(struct inode *inode) @@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) return; } + read_lock(&sbi->s_cred_lock); inode->i_uid = le32_to_cpu(fe->uid); if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || @@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; - inode->i_nlink = le16_to_cpu(fe->fileLinkCount); - if (!inode->i_nlink) - inode->i_nlink = 1; - - inode->i_size = le64_to_cpu(fe->informationLength); - iinfo->i_lenExtents = inode->i_size; - if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_fmode; @@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) else inode->i_mode = udf_convert_permissions(fe); inode->i_mode &= ~sbi->s_umask; + read_unlock(&sbi->s_cred_lock); + + inode->i_nlink = le16_to_cpu(fe->fileLinkCount); + if (!inode->i_nlink) + inode->i_nlink = 1; + + inode->i_size = le64_to_cpu(fe->informationLength); + iinfo->i_lenExtents = inode->i_size; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << @@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe) int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - - lock_kernel(); - ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); - unlock_kernel(); - - return ret; + return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int udf_sync_inode(struct inode *inode) +static int udf_sync_inode(struct inode *inode) { return udf_update_inode(inode, 1); } @@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block) struct extent_position epos = {}; int ret; - lock_kernel(); + down_read(&UDF_I(inode)->i_data_sem); if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) @@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block) else ret = 0; - unlock_kernel(); + up_read(&UDF_I(inode)->i_data_sem); brelse(epos.bh); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 6d8dc02baeb..2be0f9eb86d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -27,7 +27,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include <linux/crc-itu-t.h> @@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && - isdotdot) { - brelse(epos.bh); - return fi; - } + isdotdot) + goto out_ok; if (!lfi) continue; @@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct 
dentry *dentry, if (dentry->d_name.len > UDF_NAME_LEN - 2) return ERR_PTR(-ENAMETOOLONG); - lock_kernel(); #ifdef UDF_RECOVERY /* temporary shorthand for specifying files by inode number */ if (!strncmp(dentry->d_name.name, ".B=", 3)) { @@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, }; inode = udf_iget(dir->i_sb, lb); if (!inode) { - unlock_kernel(); return ERR_PTR(-EACCES); } } else @@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); if (!inode) { - unlock_kernel(); return ERR_PTR(-EACCES); } } - unlock_kernel(); return d_splice_alias(inode, dentry); } @@ -476,15 +469,19 @@ add: f_pos >> dir->i_sb->s_blocksize_bits, 1, err); if (!fibh->ebh) goto out_err; + /* Extents could have been merged, invalidate our position */ + brelse(epos.bh); + epos.bh = NULL; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); if (!fibh->soffset) { - if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) { - block = eloc.logicalBlockNum + ((elen - 1) >> + /* Find the freshly allocated block */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + block = eloc.logicalBlockNum + ((elen - 1) >> dir->i_sb->s_blocksize_bits); - } else - block++; - brelse(fibh->sbh); fibh->sbh = fibh->ebh; fi = (struct fileIdentDesc *)(fibh->sbh->b_data); @@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; - lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { - unlock_kernel(); return err; } @@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_nlink--; mark_inode_dirty(inode); iput(inode); - unlock_kernel(); return err; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - unlock_kernel(); d_instantiate(dentry, inode); return 0; @@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; - lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); if (!inode) @@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, inode->i_nlink--; mark_inode_dirty(inode); iput(inode); - unlock_kernel(); return err; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, err = 0; out: - unlock_kernel(); return err; } @@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) goto out; @@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = 0; out: - unlock_kernel(); return err; } @@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct kernel_lb_addr tloc; retval = -ENOENT; - lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) goto out; @@ -826,7 +813,6 @@ end_rmdir: brelse(fibh.sbh); out: - unlock_kernel(); return retval; } @@ -840,7 +826,6 @@ static int udf_unlink(struct 
inode *dir, struct dentry *dentry) struct kernel_lb_addr tloc; retval = -ENOENT; - lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) goto out; @@ -870,7 +855,6 @@ end_unlink: brelse(fibh.sbh); out: - unlock_kernel(); return retval; } @@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, int block; unsigned char *name = NULL; int namelen; - struct buffer_head *bh; struct udf_inode_info *iinfo; + struct super_block *sb = dir->i_sb; - lock_kernel(); inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); if (!inode) goto out; + iinfo = UDF_I(inode); + down_write(&iinfo->i_data_sem); name = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!name) { err = -ENOMEM; goto out_no_entry; } - iinfo = UDF_I(inode); inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; @@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct kernel_lb_addr eloc; uint32_t bsize; - block = udf_new_block(inode->i_sb, inode, + block = udf_new_block(sb, inode, iinfo->i_location.partitionReferenceNum, iinfo->i_location.logicalBlockNum, &err); if (!block) @@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, eloc.logicalBlockNum = block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; - bsize = inode->i_sb->s_blocksize; + bsize = sb->s_blocksize; iinfo->i_lenExtents = bsize; udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); - block = udf_get_pblock(inode->i_sb, block, + block = udf_get_pblock(sb, block, iinfo->i_location.partitionReferenceNum, 0); - epos.bh = udf_tgetblk(inode->i_sb, block); + epos.bh = udf_tgetblk(sb, block); lock_buffer(epos.bh); - memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); + memset(epos.bh->b_data, 0x00, bsize); set_buffer_uptodate(epos.bh); unlock_buffer(epos.bh); mark_buffer_dirty_inode(epos.bh, inode); @@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } else ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; - eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); + eoffset = sb->s_blocksize - udf_ext0_offset(inode); pc = (struct pathComponent *)ea; if (*symname == '/') { @@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } if (pc->componentType == 5) { - namelen = udf_put_filename(inode->i_sb, compstart, name, + namelen = udf_put_filename(sb, compstart, name, symname - compstart); if (!namelen) goto out_no_entry; @@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) goto out_no_entry; - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - bh = UDF_SB(inode->i_sb)->s_lvid_bh; - if (bh) { - struct logicalVolIntegrityDesc *lvid = - (struct logicalVolIntegrityDesc *)bh->b_data; - struct logicalVolHeaderDesc *lvhd; - uint64_t uniqueID; - lvhd = (struct logicalVolHeaderDesc *) - lvid->logicalVolContentsUse; - uniqueID = le64_to_cpu(lvhd->uniqueID); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); - if (!(++uniqueID & 0x00000000FFFFFFFFUL)) - uniqueID += 16; - lvhd->uniqueID = cpu_to_le64(uniqueID); - mark_buffer_dirty(bh); + cpu_to_le32(lvid_get_unique_id(sb)); } udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type 
== ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(dir); + up_write(&iinfo->i_data_sem); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); @@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, out: kfree(name); - unlock_kernel(); return err; out_no_entry: + up_write(&iinfo->i_data_sem); inode_dec_link_count(inode); iput(inode); goto out; @@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; - struct buffer_head *bh; - lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { - unlock_kernel(); return -EMLINK; } fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - unlock_kernel(); return err; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); - bh = UDF_SB(inode->i_sb)->s_lvid_bh; - if (bh) { - struct logicalVolIntegrityDesc *lvid = - (struct logicalVolIntegrityDesc *)bh->b_data; - struct logicalVolHeaderDesc *lvhd; - uint64_t uniqueID; - lvhd = (struct logicalVolHeaderDesc *) - (lvid->logicalVolContentsUse); - uniqueID = le64_to_cpu(lvhd->uniqueID); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); - if (!(++uniqueID & 0x00000000FFFFFFFFUL)) - uniqueID += 16; - lvhd->uniqueID = cpu_to_le64(uniqueID); - mark_buffer_dirty(bh); + cpu_to_le32(lvid_get_unique_id(inode->i_sb)); } udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) @@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, mark_inode_dirty(inode); ihold(inode); d_instantiate(dentry, inode); - unlock_kernel(); return 0; } @@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { if (ofibh.sbh != ofibh.ebh) @@ -1248,7 +1203,6 @@ end_rename: brelse(nfibh.ebh); brelse(nfibh.sbh); } - unlock_kernel(); return retval; } @@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child) struct fileIdentDesc cfi; struct udf_fileident_bh fibh; - lock_kernel(); if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) goto out_unlock; @@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child) inode = udf_iget(child->d_inode->i_sb, &tloc); if (!inode) goto out_unlock; - unlock_kernel(); return d_obtain_alias(inode); out_unlock: - unlock_kernel(); return ERR_PTR(-EACCES); } diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 745eb209be0..a71090ea0e0 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/buffer_head.h> +#include <linux/mutex.h> uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, uint16_t partition, uint32_t offset) @@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) struct udf_sb_info *sbi = UDF_SB(sb); u16 reallocationTableLen; struct buffer_head *bh; + int ret = 0; + mutex_lock(&sbi->s_alloc_mutex); for (i = 0; i < sbi->s_partitions; i++) { struct udf_part_map *map = &sbi->s_partmaps[i]; if (old_block > map->s_partition_root && @@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, 
long *new_block) break; } - if (!st) - return 1; + if (!st) { + ret = 1; + goto out; + } reallocationTableLen = le16_to_cpu(st->reallocationTableLen); @@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) ((old_block - map->s_partition_root) & (sdata->s_packet_len - 1)); - return 0; + ret = 0; + goto out; } else if (origLoc == packet) { *new_block = le32_to_cpu( entry->mappedLocation) + ((old_block - map->s_partition_root) & (sdata->s_packet_len - 1)); - return 0; + ret = 0; + goto out; } else if (origLoc > packet) break; } @@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) st->mapEntry[k].mappedLocation) + ((old_block - map->s_partition_root) & (sdata->s_packet_len - 1)); - return 0; + ret = 0; + goto out; } - return 1; + ret = 1; + goto out; } /* if old_block */ } if (i == sbi->s_partitions) { /* outside of partitions */ /* for now, fail =) */ - return 1; + ret = 1; } - return 0; +out: + mutex_unlock(&sbi->s_alloc_mutex); + return ret; } static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, diff --git a/fs/udf/super.c b/fs/udf/super.c index b539d53320f..7b27b063ff6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -48,7 +48,6 @@ #include <linux/stat.h> #include <linux/cdrom.h> #include <linux/nls.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/vmalloc.h> @@ -135,6 +134,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei->i_next_alloc_block = 0; ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; + init_rwsem(&ei->i_data_sem); return &ei->vfs_inode; } @@ -574,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) if (!udf_parse_options(options, &uopt, true)) return -EINVAL; - lock_kernel(); + write_lock(&sbi->s_cred_lock); sbi->s_flags = uopt.flags; sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; sbi->s_umask = uopt.umask; sbi->s_fmode = uopt.fmode; sbi->s_dmode = uopt.dmode; + write_unlock(&sbi->s_cred_lock); if (sbi->s_lvid_bh) { int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); @@ -597,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) udf_open_lvid(sb); out_unlock: - unlock_kernel(); return error; } @@ -966,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) (sizeof(struct buffer_head *) * nr_groups); if (size <= PAGE_SIZE) - bitmap = kmalloc(size, GFP_KERNEL); + bitmap = kzalloc(size, GFP_KERNEL); else - bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ + bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ if (bitmap == NULL) { udf_error(sb, __func__, @@ -977,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) return NULL; } - memset(bitmap, 0x00, size); bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); bitmap->s_nr_groups = nr_groups; return bitmap; @@ -1781,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb) if (!bh) return; + + mutex_lock(&sbi->s_alloc_mutex); lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sbi); @@ -1797,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb) lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); } static void udf_close_lvid(struct super_block *sb) @@ -1809,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb) if (!bh) return; + 
mutex_lock(&sbi->s_alloc_mutex); lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sbi); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1829,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb) lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); +} + +u64 lvid_get_unique_id(struct super_block *sb) +{ + struct buffer_head *bh; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + struct logicalVolHeaderDesc *lvhd; + u64 uniqueID; + u64 ret; + + bh = sbi->s_lvid_bh; + if (!bh) + return 0; + + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse; + + mutex_lock(&sbi->s_alloc_mutex); + ret = uniqueID = le64_to_cpu(lvhd->uniqueID); + if (!(++uniqueID & 0xFFFFFFFF)) + uniqueID += 16; + lvhd->uniqueID = cpu_to_le64(uniqueID); + mutex_unlock(&sbi->s_alloc_mutex); + mark_buffer_dirty(bh); + + return ret; } static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) @@ -1886,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) struct kernel_lb_addr rootdir, fileset; struct udf_sb_info *sbi; - lock_kernel(); - uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); uopt.uid = -1; uopt.gid = -1; @@ -1896,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) uopt.dmode = UDF_INVALID_MODE; sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); - if (!sbi) { - unlock_kernel(); + if (!sbi) return -ENOMEM; - } sb->s_fs_info = sbi; @@ -1936,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sbi->s_fmode = uopt.fmode; sbi->s_dmode = uopt.dmode; sbi->s_nls_map = uopt.nls_map; + rwlock_init(&sbi->s_cred_lock); if (uopt.session == 0xFFFFFFFF) sbi->s_session = udf_get_last_session(sb); @@ -2045,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) goto error_out; } sb->s_maxbytes = MAX_LFS_FILESIZE; - unlock_kernel(); return 0; error_out: @@ -2066,7 +2093,6 @@ error_out: kfree(sbi); sb->s_fs_info = NULL; - unlock_kernel(); return -EINVAL; } @@ -2105,8 +2131,6 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); - lock_kernel(); - if (sbi->s_vat_inode) iput(sbi->s_vat_inode); if (sbi->s_partitions) @@ -2122,8 +2146,6 @@ static void udf_put_super(struct super_block *sb) kfree(sbi->s_partmaps); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } static int udf_sync_fs(struct super_block *sb, int wait) @@ -2186,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, uint16_t ident; struct spaceBitmapDesc *bm; - lock_kernel(); - loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; bh = udf_read_ptagged(sb, &loc, 0, &ident); @@ -2224,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, } } brelse(bh); - out: - unlock_kernel(); - return accum; } @@ -2240,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, int8_t etype; struct extent_position epos; - lock_kernel(); - + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); epos.block = UDF_I(table)->i_location; epos.offset = sizeof(struct unallocSpaceEntry); epos.bh = NULL; @@ -2250,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, accum += (elen >> table->i_sb->s_blocksize_bits); brelse(epos.bh); - - 
unlock_kernel(); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); return accum; } diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 16064787d2b..b1d4488b0f1 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -27,7 +27,6 @@ #include <linux/mm.h> #include <linux/stat.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include "udf_i.h" @@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page) int err = -EIO; unsigned char *p = kmap(page); struct udf_inode_info *iinfo; + uint32_t pos; - lock_kernel(); iinfo = UDF_I(inode); + pos = udf_block_map(inode, 0); + + down_read(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; } else { - bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); + bh = sb_bread(inode->i_sb, pos); if (!bh) goto out; @@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page) udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); brelse(bh); - unlock_kernel(); + up_read(&iinfo->i_data_sem); SetPageUptodate(page); kunmap(page); unlock_page(page); return 0; out: - unlock_kernel(); + up_read(&iinfo->i_data_sem); SetPageError(page); kunmap(page); unlock_page(page); diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index e58d1de4107..d1bd31ea724 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -1,6 +1,18 @@ #ifndef _UDF_I_H #define _UDF_I_H +/* + * The i_data_sem and i_mutex serve for protection of allocation information + * of a regular files and symlinks. This includes all extents belonging to + * the file/symlink, a fact whether data are in-inode or in external data + * blocks, preallocation, goal block information... When extents are read, + * i_mutex or i_data_sem must be held (for reading is enough in case of + * i_data_sem). When extents are changed, i_data_sem must be held for writing + * and also i_mutex must be held. + * + * For directories i_mutex is used for all the necessary protection. + */ + struct udf_inode_info { struct timespec i_crtime; /* Physical address of inode */ @@ -21,6 +33,7 @@ struct udf_inode_info { struct long_ad *i_lad; __u8 *i_data; } i_ext; + struct rw_semaphore i_data_sem; struct inode vfs_inode; }; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index d113b72c276..4858c191242 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -2,6 +2,7 @@ #define __LINUX_UDF_SB_H #include <linux/mutex.h> +#include <linux/bitops.h> /* Since UDF 2.01 is ISO 13346 based... 
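The comment added to udf_i.h above spells out the i_data_sem rules that all the lock_kernel() removals in this series rely on. A toy sketch of the pattern, using only the stock rw_semaphore primitives (struct and function names are illustrative):

#include <linux/rwsem.h>

struct example_inode_info {
	struct rw_semaphore i_data_sem;	/* guards extent/alloc data */
	/* ... extent data ... */
};

static void example_init(struct example_inode_info *ei)
{
	init_rwsem(&ei->i_data_sem);	/* as udf_alloc_inode() now does */
}

static void example_read_extents(struct example_inode_info *ei)
{
	down_read(&ei->i_data_sem);	/* readers may run concurrently */
	/* ... look up a block, as udf_block_map() does above ... */
	up_read(&ei->i_data_sem);
}

static void example_change_extents(struct example_inode_info *ei)
{
	/* i_mutex must also be held here, per the udf_i.h comment */
	down_write(&ei->i_data_sem);
	/* ... truncate or allocate extents ... */
	up_write(&ei->i_data_sem);
}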
*/ #define UDF_SUPER_MAGIC 0x15013346 @@ -128,6 +129,8 @@ struct udf_sb_info { uid_t s_uid; mode_t s_fmode; mode_t s_dmode; + /* Lock protecting consistency of above permission settings */ + rwlock_t s_cred_lock; /* Root Info */ struct timespec s_record_time; @@ -139,7 +142,7 @@ struct udf_sb_info { __u16 s_udfrev; /* Miscellaneous flags */ - __u32 s_flags; + unsigned long s_flags; /* Encoding info */ struct nls_table *s_nls_map; @@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi); int udf_compute_nr_groups(struct super_block *sb, u32 partition); -#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) -#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) -#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) +static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag) +{ + return test_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_SET_FLAG(struct super_block *sb, int flag) +{ + set_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag) +{ + clear_bit(flag, &UDF_SB(sb)->s_flags); +} #endif /* __LINUX_UDF_SB_H */ diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 6995ab1f430..eba48209f9f 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -111,6 +111,8 @@ struct extent_position { }; /* super.c */ + +__attribute__((format(printf, 3, 4))) extern void udf_warning(struct super_block *, const char *, const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { @@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb) sb->s_dirt = 1; UDF_SB(sb)->s_lvid_dirty = 1; } +extern u64 lvid_get_unique_id(struct super_block *sb); /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, @@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, extern long udf_ioctl(struct file *, unsigned int, unsigned long); /* inode.c */ extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); -extern int udf_sync_inode(struct inode *); extern void udf_expand_file_adinicb(struct inode *, int, int *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0dce969d6ca..faca4499709 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ kmem.o \ xfs_aops.o \ xfs_buf.o \ + xfs_discard.o \ xfs_export.o \ xfs_file.o \ xfs_fs_subr.o \ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h deleted file mode 100644 index 4dfc7c37081..00000000000 --- a/fs/xfs/linux-2.6/sv.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
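Turning s_flags into an unsigned long is what makes the new UDF_QUERY_FLAG()/UDF_SET_FLAG()/UDF_CLEAR_FLAG() helpers safe: the test_bit()/set_bit()/clear_bit() family is atomic but operates only on unsigned long words, unlike the old non-atomic |=/&= macros. A standalone illustration with made-up flag bits:

#include <linux/bitops.h>

#define EXAMPLE_FLAG_DIRTY	0	/* bit numbers, not masks */
#define EXAMPLE_FLAG_RDONLY	1

struct example_sb {
	unsigned long s_flags;	/* must be unsigned long for bitops */
};

static void example_flags(struct example_sb *sb)
{
	set_bit(EXAMPLE_FLAG_DIRTY, &sb->s_flags);	/* atomic RMW */
	if (test_bit(EXAMPLE_FLAG_DIRTY, &sb->s_flags))
		clear_bit(EXAMPLE_FLAG_DIRTY, &sb->s_flags);
}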
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SV_H__ -#define __XFS_SUPPORT_SV_H__ - -#include <linux/wait.h> -#include <linux/sched.h> -#include <linux/spinlock.h> - -/* - * Synchronisation variables. - * - * (Parameters "pri", "svf" and "rts" are not implemented) - */ - -typedef struct sv_s { - wait_queue_head_t waiters; -} sv_t; - -static inline void _sv_wait(sv_t *sv, spinlock_t *lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&sv->waiters, &wait); - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(lock); - - schedule(); - - remove_wait_queue(&sv->waiters, &wait); -} - -#define sv_init(sv,flag,name) \ - init_waitqueue_head(&(sv)->waiters) -#define sv_destroy(sv) \ - /*NOTHING*/ -#define sv_wait(sv, pri, lock, s) \ - _sv_wait(sv, lock) -#define sv_signal(sv) \ - wake_up(&(sv)->waiters) -#define sv_broadcast(sv) \ - wake_up_all(&(sv)->waiters) - -#endif /* __XFS_SUPPORT_SV_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 691f61223ed..ec7bbb5645b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -38,15 +38,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* - * Types of I/O for bmap clustering and I/O completion tracking. - */ -enum { - IO_READ, /* mapping for a read */ - IO_DELAY, /* mapping covers delalloc region */ - IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ - IO_NEW /* just allocated */ -}; /* * Prime number of hash buckets since address is used as the key. @@ -182,9 +173,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IO_READ); - if (unlikely(ioend->io_error)) return 0; @@ -244,10 +232,8 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. 
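The deleted sv.h was only a thin veneer over wait queues, which is why it can go: every sv_*() call maps onto a stock primitive. A sketch of the open-coded equivalent of _sv_wait() and sv_signal()/sv_broadcast(), with illustrative example_ names:

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static DEFINE_SPINLOCK(example_lock);

/*
 * Equivalent of the removed _sv_wait(): the caller holds example_lock;
 * queue exclusively, drop the lock, then sleep uninterruptibly.  As in
 * the deleted helper, the lock is not retaken on wakeup.
 */
static void example_wait(void)
{
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&example_wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&example_lock);
	schedule();
	finish_wait(&example_wq, &wait);
}

/* sv_signal() and sv_broadcast() map directly to the wake_up family. */
static void example_wake(bool everyone)
{
	if (everyone)
		wake_up_all(&example_wq);	/* all waiters */
	else
		wake_up(&example_wq);		/* one exclusive waiter */
}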
*/ - if (ioend->io_type != IO_READ) { - error = xfs_setfilesize(ioend); - ASSERT(!error || error == EAGAIN); - } + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); /* * If we didn't complete processing of the ioend, requeue it to the @@ -318,14 +304,63 @@ STATIC int xfs_map_blocks( struct inode *inode, loff_t offset, - ssize_t count, struct xfs_bmbt_irec *imap, - int flags) + int type, + int nonblocking) { - int nmaps = 1; - int new = 0; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (type == IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -XFS_ERROR(EAGAIN); + xfs_ilock(ip, XFS_ILOCK_SHARED); + } - return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_maxioffset); + + if (offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + bmapi_flags, NULL, 0, imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return -XFS_ERROR(error); + + if (type == IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, count, imap); + if (!error) + trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return -XFS_ERROR(error); + } + +#ifdef DEBUG + if (type == IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; } STATIC int @@ -380,26 +415,18 @@ xfs_submit_ioend_bio( submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); - ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); - bio_put(bio); } STATIC struct bio * xfs_alloc_ioend_bio( struct buffer_head *bh) { - struct bio *bio; int nvecs = bio_get_nr_vecs(bh->b_bdev); - - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (!bio); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio_get(bio); return bio; } @@ -470,9 +497,8 @@ xfs_submit_ioend( /* Pass 1 - start writeback */ do { next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) xfs_start_buffer_writeback(bh); - } } while ((ioend = next) != NULL); /* Pass 2 - submit I/O */ @@ -600,117 +626,13 @@ xfs_map_at_offset( ASSERT(imap->br_startblock != HOLESTARTBLOCK); ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - lock_buffer(bh); xfs_map_buffer(inode, bh, imap, offset); - bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); } /* - * Look for a page at index that is suitable for clustering. 
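Two simplifications above lean on bio semantics worth spelling out: bio_alloc() draws from a mempool, so with a blocking mask like GFP_NOIO it cannot return NULL (hence the dropped retry loop), and the extra bio_get() was unneeded because the submitter's reference lasts until the end_io callback runs. A minimal sketch of the allocate/submit/complete pattern of this era, with illustrative helper names:

#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/mm.h>

/* end_io prototype of this era: void (*bi_end_io)(struct bio *, int) */
static void example_end_io(struct bio *bio, int error)
{
	complete(bio->bi_private);
	bio_put(bio);			/* drop the submitter's reference */
}

static void example_read_page(struct block_device *bdev, sector_t sector,
			      struct page *page, struct completion *done)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);	/* mempool-backed */

	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_end_io = example_end_io;
	bio->bi_private = done;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(READ, bio);
}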
- */ -STATIC unsigned int -xfs_probe_page( - struct page *page, - unsigned int pg_offset) -{ - struct buffer_head *bh, *head; - int ret = 0; - - if (PageWriteback(page)) - return 0; - if (!PageDirty(page)) - return 0; - if (!page->mapping) - return 0; - if (!page_has_buffers(page)) - return 0; - - bh = head = page_buffers(page); - do { - if (!buffer_uptodate(bh)) - break; - if (!buffer_mapped(bh)) - break; - ret += bh->b_size; - if (ret >= pg_offset) - break; - } while ((bh = bh->b_this_page) != head); - - return ret; -} - -STATIC size_t -xfs_probe_cluster( - struct inode *inode, - struct page *startpage, - struct buffer_head *bh, - struct buffer_head *head) -{ - struct pagevec pvec; - pgoff_t tindex, tlast, tloff; - size_t total = 0; - int done = 0, i; - - /* First sum forwards in this page */ - do { - if (!buffer_uptodate(bh) || !buffer_mapped(bh)) - return total; - total += bh->b_size; - } while ((bh = bh->b_this_page) != head); - - /* if we reached the end of the page, sum forwards in following pages */ - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - tindex = startpage->index + 1; - - /* Prune this back to avoid pathological behavior */ - tloff = min(tlast, startpage->index + 64); - - pagevec_init(&pvec, 0); - while (!done && tindex <= tloff) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t pg_offset, pg_len = 0; - - if (tindex == tlast) { - pg_offset = - i_size_read(inode) & (PAGE_CACHE_SIZE - 1); - if (!pg_offset) { - done = 1; - break; - } - } else - pg_offset = PAGE_CACHE_SIZE; - - if (page->index == tindex && trylock_page(page)) { - pg_len = xfs_probe_page(page, pg_offset); - unlock_page(page); - } - - if (!pg_len) { - done = 1; - break; - } - - total += pg_len; - tindex++; - } - - pagevec_release(&pvec); - cond_resched(); - } - - return total; -} - -/* * Test if a given page is suitable for writing as part of an unwritten * or delayed allocate extent. 
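xfs_cluster_write() above and the deleted probing code share one idiom: walking an address_space's pages in pagevec-sized batches. The generic shape of that loop, with illustrative names:

#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void example_walk_pages(struct address_space *mapping,
			       pgoff_t index, pgoff_t last)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);
	while (index <= last) {
		unsigned nr = min_t(pgoff_t, PAGEVEC_SIZE,
				    last - index + 1);

		if (!pagevec_lookup(&pvec, mapping, index, nr))
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* pages can be non-contiguous: trust page->index */
			index = page->index + 1;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}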
*/ @@ -731,9 +653,9 @@ xfs_is_delayed_page( if (buffer_unwritten(bh)) acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELAY); + acceptable = (type == IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_NEW); + acceptable = (type == IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -758,8 +680,7 @@ xfs_convert_page( loff_t tindex, struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, - struct writeback_control *wbc, - int all_bh) + struct writeback_control *wbc) { struct buffer_head *bh, *head; xfs_off_t end_offset; @@ -814,37 +735,30 @@ xfs_convert_page( continue; } - if (buffer_unwritten(bh) || buffer_delay(bh)) { + if (buffer_unwritten(bh) || buffer_delay(bh) || + buffer_mapped(bh)) { if (buffer_unwritten(bh)) type = IO_UNWRITTEN; + else if (buffer_delay(bh)) + type = IO_DELALLOC; else - type = IO_DELAY; + type = IO_OVERWRITE; if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - - xfs_map_at_offset(inode, bh, imap, offset); + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, imap, offset); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); page_dirty--; count++; } else { - type = IO_NEW; - if (buffer_mapped(bh) && all_bh) { - lock_buffer(bh); - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - count++; - page_dirty--; - } else { - done = 1; - } + done = 1; } } while (offset += len, (bh = bh->b_this_page) != head); @@ -876,7 +790,6 @@ xfs_cluster_write( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int all_bh, pgoff_t tlast) { struct pagevec pvec; @@ -891,7 +804,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc, all_bh); + imap, ioendp, wbc); if (done) break; } @@ -935,7 +848,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELAY)) + if (!xfs_is_delayed_page(page, IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1002,10 +915,10 @@ xfs_vm_writepage( unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; - ssize_t size, len; - int flags, err, imap_valid = 0, uptodate = 1; + ssize_t len; + int err, imap_valid = 0, uptodate = 1; int count = 0; - int all_bh = 0; + int nonblocking = 0; trace_xfs_writepage(inode, page, 0); @@ -1056,10 +969,14 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - flags = BMAPI_READ; - type = IO_NEW; + type = IO_OVERWRITE; + + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + nonblocking = 1; do { + int new_ioend = 0; + if (offset >= end_offset) break; if (!buffer_uptodate(bh)) @@ -1076,90 +993,54 @@ xfs_vm_writepage( continue; } - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - - if (buffer_unwritten(bh) || buffer_delay(bh)) { - int new_ioend = 0; - - /* - * Make sure we don't use a read-only iomap - */ - if (flags == BMAPI_READ) - imap_valid = 0; - - if (buffer_unwritten(bh)) { + if (buffer_unwritten(bh)) { + if (type != IO_UNWRITTEN) { type = IO_UNWRITTEN; - flags = BMAPI_WRITE | BMAPI_IGNSTATE; - } else if (buffer_delay(bh)) { - type = IO_DELAY; - flags = BMAPI_ALLOCATE; - - if (wbc->sync_mode == WB_SYNC_NONE) - flags |= BMAPI_TRYLOCK; - } - - if (!imap_valid) { - /* - * If we didn't have a valid 
mapping then we - * need to ensure that we put the new mapping - * in a new ioend structure. This needs to be - * done to ensure that the ioends correctly - * reflect the block mappings at io completion - * for unwritten extent conversion. - */ - new_ioend = 1; - err = xfs_map_blocks(inode, offset, len, - &imap, flags); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, - offset); + imap_valid = 0; } - if (imap_valid) { - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, - &ioend, new_ioend); - count++; + } else if (buffer_delay(bh)) { + if (type != IO_DELALLOC) { + type = IO_DELALLOC; + imap_valid = 0; } } else if (buffer_uptodate(bh)) { - /* - * we got here because the buffer is already mapped. - * That means it must already have extents allocated - * underneath it. Map the extent by reading it. - */ - if (!imap_valid || flags != BMAPI_READ) { - flags = BMAPI_READ; - size = xfs_probe_cluster(inode, page, bh, head); - err = xfs_map_blocks(inode, offset, size, - &imap, flags); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, - offset); + if (type != IO_OVERWRITE) { + type = IO_OVERWRITE; + imap_valid = 0; } + } else { + if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); + imap_valid = 0; + } + continue; + } + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + if (!imap_valid) { /* - * We set the type to IO_NEW in case we are doing a - * small write at EOF that is extending the file but - * without needing an allocation. We need to update the - * file size on I/O completion in this case so it is - * the same case as having just allocated a new extent - * that we are writing into for the first time. + * If we didn't have a valid mapping then we need to + * put the new mapping into a separate ioend structure. + * This ensures non-contiguous extents always have + * separate ioends, which is particularly important + * for unwritten extent conversion at I/O completion + * time. */ - type = IO_NEW; - if (trylock_buffer(bh)) { - if (imap_valid) - all_bh = 1; - xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !imap_valid); - count++; - } else { - imap_valid = 0; - } - } else if (PageUptodate(page)) { - ASSERT(buffer_mapped(bh)); - imap_valid = 0; + new_ioend = 1; + err = xfs_map_blocks(inode, offset, &imap, type, + nonblocking); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, offset); + } + if (imap_valid) { + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, &ioend, + new_ioend); + count++; } if (!iohead) @@ -1188,7 +1069,7 @@ xfs_vm_writepage( end_index = last_index; xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, all_bh, end_index); + wbc, end_index); } if (iohead) @@ -1257,13 +1138,19 @@ __xfs_get_blocks( int create, int direct) { - int flags = create ? 
BMAPI_WRITE : BMAPI_READ; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; struct xfs_bmbt_irec imap; + int nimaps = 1; xfs_off_t offset; ssize_t size; - int nimap = 1; int new = 0; - int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); @@ -1272,15 +1159,45 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (direct && create) - flags |= BMAPI_DIRECT; + if (create) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); + } else { + lockmode = xfs_ilock_map_shared(ip); + } + + ASSERT(offset <= mp->m_maxioffset); + if (offset + size > mp->m_maxioffset) + size = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); + offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, - &new); + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); if (error) - return -error; - if (nimap == 0) - return 0; + goto out_unlock; + + if (create && + (!nimaps || + (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK))) { + if (direct) { + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, size, &imap); + } + if (error) + goto out_unlock; + + trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + } else if (nimaps) { + trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + } else { + trace_xfs_get_blocks_notfound(ip, offset, size); + goto out_unlock; + } + xfs_iunlock(ip, lockmode); if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1347,6 +1264,10 @@ __xfs_get_blocks( } return 0; + +out_unlock: + xfs_iunlock(ip, lockmode); + return -error; } int @@ -1434,7 +1355,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - iocb->private = xfs_alloc_ioend(inode, IO_NEW); + iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index c5057fb6237..71f721e1a71 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_DIRECT = 0, /* special case for direct I/O ioends */ + IO_DELALLOC, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_OVERWRITE, /* mapping covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { 0, "" }, \ + { IO_DELALLOC, "delalloc" }, \ + { IO_UNWRITTEN, "unwritten" }, \ + { IO_OVERWRITE, "overwrite" } + +/* * xfs_ioend struct manages large extent writes for XFS. * It can manage several multi-page bio's at once. 
*/ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 4c5deb6e9e3..ac1c7e8378d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -44,12 +44,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); -static struct shrinker xfs_buf_shake = { - .shrink = xfsbufd_wakeup, - .seeks = DEFAULT_SEEKS, -}; static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -168,8 +163,79 @@ test_page_region( } /* - * Internal xfs_buf_t object manipulation + * xfs_buf_lru_add - add a buffer to the LRU. + * + * The LRU takes a new reference to the buffer so that it will only be freed + * once the shrinker takes the buffer off the LRU. */ +STATIC void +xfs_buf_lru_add( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (list_empty(&bp->b_lru)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_lru, &btp->bt_lru); + btp->bt_lru_nr++; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * xfs_buf_lru_del - remove a buffer from the LRU + * + * The unlocked check is safe here because it only occurs when there are not + * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there + * to optimise the shrinker removing the buffer from the LRU and calling + * xfs_buf_free(). i.e. it removes an unneccessary round trip on the + * bt_lru_lock. + */ +STATIC void +xfs_buf_lru_del( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + if (list_empty(&bp->b_lru)) + return; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. + * + * This prevents build-up of stale buffers on the LRU. 
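The add/del helpers above implement a common pattern: the LRU list owns one reference to every object it holds, and an unlocked list_empty() probe is revalidated under the lock. Reduced to its essentials (all example_ names are illustrative, not XFS code):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct example_buf {
	struct list_head lru;
	atomic_t hold;		/* lifetime reference count */
	atomic_t lru_ref;	/* trips to grant through the LRU */
};

static LIST_HEAD(example_lru);
static DEFINE_SPINLOCK(example_lru_lock);
static unsigned int example_lru_nr;

static void example_lru_add(struct example_buf *bp)
{
	spin_lock(&example_lru_lock);
	if (list_empty(&bp->lru)) {
		atomic_inc(&bp->hold);	/* the LRU holds a reference */
		list_add_tail(&bp->lru, &example_lru);
		example_lru_nr++;
	}
	spin_unlock(&example_lru_lock);
}

static void example_lru_del(struct example_buf *bp)
{
	if (list_empty(&bp->lru))	/* unlocked fast path */
		return;

	spin_lock(&example_lru_lock);
	if (!list_empty(&bp->lru)) {	/* recheck under the lock */
		list_del_init(&bp->lru);
		example_lru_nr--;
	}
	spin_unlock(&example_lru_lock);
}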
STATIC void _xfs_buf_initialize( @@ -186,7 +252,9 @@ _xfs_buf_initialize( memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ @@ -262,6 +330,8 @@ xfs_buf_free( { trace_xfs_buf_free(bp, _RET_IP_); + ASSERT(list_empty(&bp->b_lru)); + if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; @@ -337,7 +407,6 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -827,7 +896,7 @@ xfs_buf_rele( trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { - ASSERT(!bp->b_relse); + ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) xfs_buf_free(bp); @@ -835,13 +904,15 @@ xfs_buf_rele( } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (bp->b_relse) { - atomic_inc(&bp->b_hold); + if (!(bp->b_flags & XBF_STALE) && + atomic_read(&bp->b_lru_ref)) { + xfs_buf_lru_add(bp); spin_unlock(&pag->pag_buf_lock); - bp->b_relse(bp); } else { + xfs_buf_lru_del(bp); ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -1438,51 +1509,84 @@ xfs_buf_iomove( */ /* - * Wait for any bufs with callbacks that have been submitted but - * have not yet returned... walk the hash list for the target. + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. */ void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_perag *pag; - uint i; + struct xfs_buf *bp; - for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { - pag = xfs_perag_get(btp->bt_mount, i); - spin_lock(&pag->pag_buf_lock); - while (rb_first(&pag->pag_buf_tree)) { - spin_unlock(&pag->pag_buf_lock); +restart: + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + if (atomic_read(&bp->b_hold) > 1) { + spin_unlock(&btp->bt_lru_lock); delay(100); - spin_lock(&pag->pag_buf_lock); + goto restart; } - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele().
+ */ + atomic_set(&bp->b_lru_ref, 0); + spin_unlock(&btp->bt_lru_lock); + xfs_buf_rele(bp); + spin_lock(&btp->bt_lru_lock); } + spin_unlock(&btp->bt_lru_lock); } -/* - * buftarg list for delwrite queue processing - */ -static LIST_HEAD(xfs_buftarg_list); -static DEFINE_SPINLOCK(xfs_buftarg_lock); - -STATIC void -xfs_register_buftarg( - xfs_buftarg_t *btp) +int +xfs_buftarg_shrink( + struct shrinker *shrink, + int nr_to_scan, + gfp_t mask) { - spin_lock(&xfs_buftarg_lock); - list_add(&btp->bt_list, &xfs_buftarg_list); - spin_unlock(&xfs_buftarg_lock); -} + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + struct xfs_buf *bp; + LIST_HEAD(dispose); -STATIC void -xfs_unregister_buftarg( - xfs_buftarg_t *btp) -{ - spin_lock(&xfs_buftarg_lock); - list_del(&btp->bt_list); - spin_unlock(&xfs_buftarg_lock); + if (!nr_to_scan) + return btp->bt_lru_nr; + + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + if (nr_to_scan-- <= 0) + break; + + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + list_move_tail(&bp->b_lru, &btp->bt_lru); + continue; + } + + /* + * remove the buffer from the LRU now to avoid needing another + * lock round trip inside xfs_buf_rele(). + */ + list_move(&bp->b_lru, &dispose); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); + + while (!list_empty(&dispose)) { + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return btp->bt_lru_nr; } void @@ -1490,17 +1594,14 @@ xfs_free_buftarg( struct xfs_mount *mp, struct xfs_buftarg *btp) { + unregister_shrinker(&btp->bt_shrinker); + xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); iput(btp->bt_mapping->host); - /* Unregister the buftarg first so that we don't get a - * wakeup finding a non-existent task - */ - xfs_unregister_buftarg(btp); kthread_stop(btp->bt_task); - kmem_free(btp); } @@ -1597,20 +1698,13 @@ xfs_alloc_delwrite_queue( xfs_buftarg_t *btp, const char *fsname) { - int error = 0; - - INIT_LIST_HEAD(&btp->bt_list); INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) { - error = PTR_ERR(btp->bt_task); - goto out_error; - } - xfs_register_buftarg(btp); -out_error: - return error; + if (IS_ERR(btp->bt_task)) + return PTR_ERR(btp->bt_task); + return 0; } xfs_buftarg_t * @@ -1627,12 +1721,17 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; + INIT_LIST_HEAD(&btp->bt_lru); + spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; + btp->bt_shrinker.shrink = xfs_buftarg_shrink; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&btp->bt_shrinker); return btp; error: @@ -1737,27 +1836,6 @@ xfs_buf_runall_queues( flush_workqueue(queue); } -STATIC int -xfsbufd_wakeup( - struct shrinker *shrink, - int priority, - gfp_t mask) -{ - xfs_buftarg_t *btp; - - spin_lock(&xfs_buftarg_lock); - list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { - if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) - 
continue; - if (list_empty(&btp->bt_delwrite_queue)) - continue; - set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); - wake_up_process(btp->bt_task); - } - spin_unlock(&xfs_buftarg_lock); - return 0; -} - /* * Move as many buffers as specified to the supplied list * indicating if we skipped any buffers to prevent deadlocks. @@ -1952,7 +2030,6 @@ xfs_buf_init(void) if (!xfsconvertd_workqueue) goto out_destroy_xfsdatad_workqueue; - register_shrinker(&xfs_buf_shake); return 0; out_destroy_xfsdatad_workqueue: @@ -1968,7 +2045,6 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - unregister_shrinker(&xfs_buf_shake); destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 383a3f37cf9..cbe65950e52 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -128,10 +128,15 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_list; struct list_head bt_delwrite_queue; spinlock_t bt_delwrite_lock; unsigned long bt_flags; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_head bt_lru; + spinlock_t bt_lru_lock; + unsigned int bt_lru_nr; } xfs_buftarg_t; /* @@ -147,8 +152,6 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); -typedef void (*xfs_buf_relse_t)(struct xfs_buf *); -typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 @@ -164,9 +167,11 @@ typedef struct xfs_buf { xfs_off_t b_file_offset; /* offset in file */ size_t b_buffer_length;/* size of buffer in bytes */ atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ + struct list_head b_lru; /* lru list */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ @@ -176,7 +181,6 @@ typedef struct xfs_buf { void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ - xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) -#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) #define XFS_BUF_SUPER_STALE(bp) do { \ @@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) #define XFS_BUF_SET_START(bp) do { } while (0) -#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) @@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) +static inline void +xfs_buf_set_ref( + struct xfs_buf *bp, + int
lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) -#define XFS_BUF_SET_REF(bp, ref) do { } while (0) #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) @@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void); static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->b_relse) - xfs_buf_unlock(bp); + xfs_buf_unlock(bp); xfs_buf_rele(bp); } diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 00000000000..05201ae719e --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + + /* + * Too small? Give up. + */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. 
+ */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 00000000000..e82b6dd3e12 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,8 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); + +#endif /* XFS_DISCARD_H */
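xfs_ioc_trim() above is the kernel side of the generic FITRIM interface: userspace passes a byte range and a minimum extent length, and on return range.len holds the number of bytes actually discarded. This is roughly what the fstrim(8) utility boils down to; a minimal sketch of such a caller, assuming a toolchain whose <linux/fs.h> exposes FITRIM and struct fstrim_range:

/* build: cc fitrim_demo.c -o fitrim_demo; run as root: ./fitrim_demo /mnt */
#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = ULLONG_MAX;	/* the fstrim default the comment above mentions */
	range.minlen = 0;	/* the kernel raises this to the discard granularity */

	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}

	/* on success the kernel rewrites range.len with the bytes trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}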
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 3764d74790e..fc0114da7fd 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -70,8 +70,16 @@ xfs_fs_encode_fh( else fileid_type = FILEID_INO32_GEN_PARENT; - /* filesystem may contain 64bit inode numbers */ - if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) + /* + * If the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. + */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) fileid_type |= XFS_FILEID_TYPE_64FLAG; /* diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ba8ad422a16..a55c1b46b21 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,10 +37,45 @@ #include "xfs_trace.h" #include <linux/dcache.h> +#include <linux/falloc.h> static const struct vm_operations_struct xfs_file_vm_ops; /* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* * xfs_iozero * * xfs_iozero clears the specified range of buffer supplied, @@ -262,22 +297,21 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (iocb->ki_pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } } - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } else + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -285,7 +319,7 @@ xfs_file_aio_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -309,7 +343,7 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_splice_read(ip, count, *ppos, ioflags); @@ -317,10 +351,61 @@ xfs_file_splice_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occurred. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +}
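xfs_aio_write_isize_update() above uses the classic check/lock/re-check idiom: peek at the size without the lock to skip it in the common non-extending case, then re-test under ip->i_lock because a racing writer may have extended the file first. A standalone sketch of just that idiom (hypothetical names; an atomic stands in for i_size_read() and a pthread mutex for the ILOCK):

/* build: cc -std=c11 -pthread isize_sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long long isize;	/* stands in for ip->i_size */
static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

static void isize_update(long long pos)
{
	if (pos <= atomic_load(&isize))
		return;		/* common case: not extending, lock avoided */

	pthread_mutex_lock(&ilock);
	if (pos > atomic_load(&isize))	/* re-check: a racer may have won */
		atomic_store(&isize, pos);
	pthread_mutex_unlock(&ilock);
}

int main(void)
{
	isize_update(4096);
	isize_update(1024);	/* no-op: the size never moves backwards */
	printf("i_size = %lld\n", (long long)atomic_load(&isize));
	return 0;
}

The unlocked peek is safe because the size only ever grows here: a stale read can at worst cause one extra trip through the locked path, never a missed or backwards update.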
+ +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * could cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. + */ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -331,7 +416,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize, new_size; + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -355,27 +440,9 @@ xfs_file_splice_write( trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -562,247 +629,314 @@ out_lock: return error; } +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. + */ STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0, error = 0; - int ioflags = 0; - xfs_fsize_t isize, new_size; - int iolock; - size_t ocount = 0, count; - int need_i_mutex; + xfs_fsize_t new_size; + int error = 0; - XFS_STATS_INC(xs_write_calls); + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } - BUG_ON(iocb->ki_pos != pos); + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write.
+ */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); - error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; - count = ocount; - if (count == 0) - return 0; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; +} -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + size_t count = ocount; + int unaligned_io = 0; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + if (mapping->nrpages) { + WARN_ON(*iolock != XFS_IOLOCK_EXCL); + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; } - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} - if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + size_t count = ocount; - new_size = pos + count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. 
+ * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* */ - - if (pos > ip->i_size) { - error = xfs_zero_eof(ip, pos, ip->i_size); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + current->backing_dev_info = NULL; + return ret; +} - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - error = -file_remove_suid(file); - if (unlikely(error)) - goto out_unlock_internal; +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + XFS_STATS_INC(xs_write_calls); - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(ip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } + BUG_ON(iocb->ki_pos != pos); - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } + if (ocount == 0) + return 0; - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. 
- */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; - pos += ret; - count -= ret; + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(ip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); -write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } + if (ret <= 0) + goto out_unlock; - current->backing_dev_info = NULL; + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error, error2; - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) - iocb->ki_pos = isize; + xfs_rw_iunlock(ip, iolock); + error = filemap_write_and_wait_range(mapping, pos, end); + xfs_rw_ilock(ip, iolock); - if (iocb->ki_pos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (iocb->ki_pos > ip->i_size) - ip->i_size = iocb->ki_pos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error2 = -xfs_file_fsync(file, + (file->f_flags & __O_SYNC) ? 0 : 1); + if (error) + ret = error; + else if (error2) + ret = error2; } - error = -ret; - if (ret <= 0) - goto out_unlock_internal; +out_unlock: + xfs_aio_write_newsize_update(ip); + xfs_rw_iunlock(ip, iolock); + return ret; +} - XFS_STATS_ADD(xs_write_bytes, ret); +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error2; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); + xfs_ilock(ip, XFS_IOLOCK_EXCL); - error2 = -xfs_file_fsync(file, - (file->f_flags & __O_SYNC) ? 0 : 1); - if (!error) - error = error2; + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; } - out_unlock_internal: - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. 
In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. - */ - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } - xfs_iunlock(ip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; } + STATIC int xfs_file_open( struct inode *inode, @@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index ad442d9e392..b06ede1d0be 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" #include "xfs_vnodeops.h" +#include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" @@ -1294,6 +1295,8 @@ xfs_file_ioctl( trace_xfs_file_ioctl(ip); switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 94d5fd6a297..bd5727852fd 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/falloc.h> #include <linux/fiemap.h> #include <linux/slab.h> @@ -505,58 +504,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, - 0, XFS_ATTR_NOLOCK); - if (error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 214ddd71ff7..09649499774 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ 
b/fs/xfs/linux-2.6/xfs_linux.h @@ -37,7 +37,6 @@ #include <kmem.h> #include <mrlock.h> -#include <sv.h> #include <time.h> #include <support/debug.h> diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 064f964d4f3..9731898083a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -606,7 +606,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid device [%s], error=%d\n", name, error); @@ -620,7 +621,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } /* @@ -834,8 +835,11 @@ xfsaild_wakeup( struct xfs_ail *ailp, xfs_lsn_t threshold_lsn) { - ailp->xa_target = threshold_lsn; - wake_up_process(ailp->xa_task); + /* only ever move the target forwards */ + if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) { + ailp->xa_target = threshold_lsn; + wake_up_process(ailp->xa_task); + } } STATIC int @@ -847,8 +851,17 @@ xfsaild( long tout = 0; /* milliseconds */ while (!kthread_should_stop()) { - schedule_timeout_interruptible(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + /* + * for short sleeps indicating congestion, don't allow us to + * get woken early. Otherwise all we do is bang on the AIL lock + * without making progress. + */ + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); /* swsusp */ try_to_freeze(); @@ -935,7 +948,7 @@ out_reclaim: * Slab object creation initialisation for the XFS inode. * This covers only the idempotent fields in the XFS inode; * all other fields need to be initialised on allocation - * from the slab. This avoids the need to repeatedly intialise + * from the slab. This avoids the need to repeatedly initialise * fields in the xfs inode that left in the initialise state * when freeing the inode. */ @@ -1118,6 +1131,8 @@ xfs_fs_evict_inode( */ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); xfs_inactive(ip); } @@ -1399,7 +1414,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp, SYNC_WAIT); + return -xfs_fs_log_dummy(mp); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index afb0d7cfad1..e22f0057d21 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab( { struct inode *inode = VFS_I(ip); + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. 
Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return EFSCORRUPTED; - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - return ENOENT; - /* If we can't grab the inode, it must be on its way to reclaim. */ if (!igrab(inode)) return ENOENT; @@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab( /* inode is valid */ return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; } STATIC int @@ -98,12 +118,12 @@ restart: int error = 0; int i; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); if (!nr_found) { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); break; } @@ -118,18 +138,26 @@ restart: batch[i] = NULL; /* - * Update the index for the next lookup. Catch overflows - * into the next AG range which can occur if we have inodes - * in the last block of the AG and we are currently - * pointing to the last inode. + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that led + * us to see this inode, so another lookup from the + * same index will not find it again. */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. */ - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); for (i = 0; i < nr_found; i++) { if (!batch[i]) @@ -334,7 +362,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -475,13 +503,14 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ - error = xfs_qm_sync(mp, SYNC_TRYLOCK); if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp, 0); + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); @@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } @@ -639,9 +668,14 @@ xfs_reclaim_inode_grab( struct xfs_inode *ip, int flags) { + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; /* - * do some unlocked checks first to avoid unnecceary lock traffic. + * do some unlocked checks first to avoid unnecessary lock traffic. * The first is a flush lock check, the second is an already in reclaim * check. Only do these checks if we are not going to block on locks. */
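Both grab paths in this file now face the same hazard: with lockless RCU lookups, a pointer pulled from the radix tree may name an inode that has since been freed or reallocated, so its identity (i_ino != 0) and state flags must be re-validated under i_flags_lock before a reference is taken. A userspace sketch of that validate-then-reference step (hypothetical flags and types; no real RCU, only the locking shape):

/* build: cc -std=c11 -pthread grab_sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { INEW = 1, IRECLAIM = 2 };

struct inode_slot {
	pthread_mutex_t flags_lock;	/* plays ip->i_flags_lock */
	unsigned long ino;		/* 0 means freed behind RCU */
	unsigned int flags;
	atomic_int refcount;		/* what igrab() would bump */
};

/* mirrors xfs_inode_ag_walk_grab(): reject stale or busy objects */
static int slot_grab(struct inode_slot *ip)
{
	pthread_mutex_lock(&ip->flags_lock);
	if (!ip->ino)				/* reused slot: lookup raced */
		goto out_noent;
	if (ip->flags & (INEW | IRECLAIM))	/* not ready or going away */
		goto out_noent;
	atomic_fetch_add(&ip->refcount, 1);	/* now safe to use */
	pthread_mutex_unlock(&ip->flags_lock);
	return 0;
out_noent:
	pthread_mutex_unlock(&ip->flags_lock);
	return -1;
}

int main(void)
{
	struct inode_slot live = {
		.flags_lock = PTHREAD_MUTEX_INITIALIZER, .ino = 42,
	};
	struct inode_slot stale = {
		.flags_lock = PTHREAD_MUTEX_INITIALIZER, .ino = 0,
	};

	printf("live inode:  %s\n", slot_grab(&live) ? "skipped" : "grabbed");
	printf("stale inode: %s\n", slot_grab(&stale) ? "skipped" : "grabbed");
	return 0;
}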
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab( * The radix tree lock here protects a thread in xfs_iget from racing * with us starting reclaim on the inode. Once we have the * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check that + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. */ spin_lock(&ip->i_flags_lock); - ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* ignore as it is already under reclaim */ + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ spin_unlock(&ip->i_flags_lock); return 1; } @@ -795,12 +834,12 @@ reclaim: * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate @@ -864,14 +903,14 @@ restart: struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; - write_lock(&pag->pag_ici_lock); + rcu_read_lock(); nr_found = radix_tree_gang_lookup_tag( &pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH, XFS_ICI_RECLAIM_TAG); if (!nr_found) { - write_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); break; } @@ -891,14 +930,24 @@ restart: * occur if we have inodes in the last block of * the AG and we are currently pointing to the * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that led us + * to see this inode, so another lookup from + * the same index will not find it again. */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. */ - write_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); for (i = 0; i < nr_found; i++) { if (!batch[i])
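Both walkers in this file now share one batching shape: collect up to XFS_LOOKUP_BATCH entries inside the RCU read section, leave it, then grab or reclaim each entry with no lookup lock held. A compact sketch of that loop structure (hypothetical stand-ins: a mutex plays rcu_read_lock() and a flat array plays the radix tree):

/* build: cc -std=c11 -pthread batch_sketch.c */
#include <pthread.h>
#include <stdio.h>

#define NR_ITEMS	10
#define LOOKUP_BATCH	4	/* like XFS_LOOKUP_BATCH */

static int items[NR_ITEMS];
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for radix_tree_gang_lookup(): copy out up to 'batch' entries */
static int gang_lookup(int first, int *out, int batch)
{
	int nr = 0;

	while (first < NR_ITEMS && nr < batch)
		out[nr++] = items[first++];
	return nr;
}

int main(void)
{
	int first = 0;

	for (int i = 0; i < NR_ITEMS; i++)
		items[i] = i * i;

	for (;;) {
		int batch[LOOKUP_BATCH], nr;

		pthread_mutex_lock(&tree_lock);		/* rcu_read_lock() */
		nr = gang_lookup(first, batch, LOOKUP_BATCH);
		pthread_mutex_unlock(&tree_lock);	/* rcu_read_unlock() */
		if (!nr)
			break;
		first += nr;	/* advance the cursor past this batch */

		/* the expensive work happens outside the lookup lock */
		for (int i = 0; i < nr; i++)
			printf("processing item %d\n", batch[i]);
	}
	return 0;
}

Batching bounds how long the read section is held while still letting the expensive per-object work (igrab, flushing, reclaim) run with no lookup lock at all.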
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7bb5092d6ae..ee3cee097e7 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -18,6 +18,7 @@ #include "xfs.h" #include <linux/sysctl.h> #include <linux/proc_fs.h> +#include "xfs_error.h" static struct ctl_table_header *xfs_table_header; @@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler( return ret; } + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} #endif /* CONFIG_PROC_FS */ static ctl_table xfs_table[] = { @@ -77,7 +98,7 @@ static ctl_table xfs_table[] = { .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_panic_mask_proc_handler, .extra1 = &xfs_params.panic_mask.min, .extra2 = &xfs_params.panic_mask.max }, diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index acef2e98c59..2d0bcb47907 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __field(int, curr_res) __field(int, unit_res) __field(unsigned int, flags) - __field(void *, reserve_headq) - __field(void *, write_headq) + __field(int, reserveq) + __field(int, writeq) __field(int, grant_reserve_cycle) __field(int, grant_reserve_bytes) __field(int, grant_write_cycle) @@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res = tic->t_curr_res; __entry->unit_res = tic->t_unit_res; __entry->flags = tic->t_flags; - __entry->reserve_headq = log->l_reserve_headq; - __entry->write_headq = log->l_write_headq; - __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; - __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; - __entry->grant_write_cycle = log->l_grant_write_cycle; - __entry->grant_write_bytes = log->l_grant_write_bytes; + __entry->reserveq = list_empty(&log->l_reserveq); + __entry->writeq = list_empty(&log->l_writeq); + xlog_crack_grant_head(&log->l_grant_reserve_head, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_grant_write_head, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); __entry->curr_cycle = log->l_curr_cycle; __entry->curr_block = log->l_curr_block; - __entry->tail_lsn = log->l_tail_lsn; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " - "t_unit_res %u t_flags %s reserve_headq 0x%p " - "write_headq 0x%p grant_reserve_cycle %d " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", @@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res, __entry->unit_res, __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), - __entry->reserve_headq, - __entry->write_headq, + __entry->reserveq ? "empty" : "active", + __entry->writeq ?
"empty" : "active", __entry->grant_reserve_cycle, __entry->grant_reserve_bytes, __entry->grant_write_cycle, @@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); @@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); @@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); -DECLARE_EVENT_CLASS(xfs_iomap_class, +DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int flags, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, flags, irec), + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) + __field(int, type) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; + __entry->type = type; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? 
irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd flags %s " + "offset 0x%llx count %zd type %s " "startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __entry->new_size, __entry->offset, __entry->count, - __print_flags(__entry->flags, "|", BMAPI_FLAGS), + __print_symbolic(__entry->type, XFS_IO_TYPES), __entry->startoff, (__int64_t)__entry->startblock, __entry->blockcount) ) #define DEFINE_IOMAP_EVENT(name) \ -DEFINE_EVENT(xfs_iomap_class, name, \ +DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int flags, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, flags, irec)) -DEFINE_IOMAP_EVENT(xfs_iomap_enter); -DEFINE_IOMAP_EVENT(xfs_iomap_found); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \ TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); TRACE_EVENT(xfs_itruncate_start, @@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \ TP_PROTO(struct xfs_alloc_arg *args), \ TP_ARGS(args)) DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_near_first); @@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index faf8e1a83a1..d22aa310310 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -149,7 +149,6 @@ xfs_qm_dqdestroy( ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); - sv_destroy(&dqp->q_pinwait); kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); atomic_dec(&xfs_Gqm->qm_totaldquots); diff --git a/fs/xfs/support/debug.c 
b/fs/xfs/support/debug.c index 975aa10e1a4..0df88897ef8 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -25,86 +25,78 @@ #include "xfs_mount.h" #include "xfs_error.h" -static char message[1024]; /* keep it off the stack */ -static DEFINE_SPINLOCK(xfs_err_lock); - -/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ -#define XFS_MAX_ERR_LEVEL 7 -#define XFS_ERR_MASK ((1 << 3) - 1) -static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = - {KERN_EMERG, KERN_ALERT, KERN_CRIT, - KERN_ERR, KERN_WARNING, KERN_NOTICE, - KERN_INFO, KERN_DEBUG}; - void -cmn_err(register int level, char *fmt, ...) +cmn_err( + const char *lvl, + const char *fmt, + ...) { - char *fp = fmt; - int len; - ulong flags; - va_list ap; - - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - va_start(ap, fmt); - if (*fmt == '!') fp++; - len = vsnprintf(message, sizeof(message), fp, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; - printk("%s%s\n", err_level[level], message); - va_end(ap); - spin_unlock_irqrestore(&xfs_err_lock,flags); - BUG_ON(level == CE_PANIC); + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%pV", lvl, &vaf); + va_end(args); + + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); } void -xfs_fs_vcmn_err( - int level, +xfs_fs_cmn_err( + const char *lvl, struct xfs_mount *mp, - char *fmt, - va_list ap) + const char *fmt, + ...) { - unsigned long flags; - int len = 0; + struct va_format vaf; + va_list args; - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - spin_lock_irqsave(&xfs_err_lock,flags); + printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf); + va_end(args); - if (mp) { - len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); +} + +/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */ +void +xfs_cmn_err( + int panic_tag, + const char *lvl, + struct xfs_mount *mp, + const char *fmt, + ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; - /* - * Skip the printk if we can't print anything useful - * due to an over-long device name. 
- */ - if (len >= sizeof(message)) - goto out; + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + printk(KERN_ALERT "XFS: Transforming an alert into a BUG."); + do_panic = 1; } - len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - printk("%s%s\n", err_level[level], message); - out: - spin_unlock_irqrestore(&xfs_err_lock,flags); + printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf); + va_end(args); - BUG_ON(level == CE_PANIC); + BUG_ON(do_panic); } void assfail(char *expr, char *file, int line) { - printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); + printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr, + file, line); BUG(); } diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index d2d20462fd4..05699f67d47 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -20,15 +20,22 @@ #include <stdarg.h> -#define CE_DEBUG 7 /* debug */ -#define CE_CONT 6 /* continuation */ -#define CE_NOTE 5 /* notice */ -#define CE_WARN 4 /* warning */ -#define CE_ALERT 1 /* alert */ -#define CE_PANIC 0 /* panic */ - -extern void cmn_err(int, char *, ...) - __attribute__ ((format (printf, 2, 3))); +struct xfs_mount; + +#define CE_DEBUG KERN_DEBUG +#define CE_CONT KERN_INFO +#define CE_NOTE KERN_NOTICE +#define CE_WARN KERN_WARNING +#define CE_ALERT KERN_ALERT +#define CE_PANIC KERN_EMERG + +void cmn_err(const char *lvl, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 4, 5))); + extern void assfail(char *expr, char *f, int l); #define ASSERT_ALWAYS(expr) \ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 63c7a1a6c02..58632cc17f2 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -227,7 +227,7 @@ typedef struct xfs_perag { atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - rwlock_t pag_ici_lock; /* incore inode lock */ + spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ struct mutex pag_ici_reclaim_lock; /* serialisation point */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 112abc439ca..f3227984a9b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,10 +41,6 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -static int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - /* * Prototypes for per-ag allocation routines */ @@ -94,7 +90,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -127,7 +123,7 @@ xfs_alloc_update( /* * Get the data from the pointed-to record. 
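With xfs_alloc_lookup_le() and xfs_alloc_get_rec() made non-static by this patch, callers outside xfs_alloc.c can pair them to read free-space btree records directly. A minimal sketch of that lookup-then-read pattern, modelled on xfs_alloc_ag_vextent_exact() below; the cursor setup, the minlen value of 1, and the positive-errno return convention are assumptions taken from the surrounding code, not part of this patch:

/*
 * Sketch: find the free extent record at or below "bno" using an
 * existing by-number btree cursor. The caller owns the cursor and
 * is responsible for deleting it on both success and error.
 */
static int
example_lookup_free_extent(
	struct xfs_btree_cur	*bno_cur,
	xfs_agblock_t		bno,
	xfs_agblock_t		*fbno,	/* output: start of free extent */
	xfs_extlen_t		*flen)	/* output: length of free extent */
{
	int			error;
	int			i;	/* lookup/get status */

	/* position the cursor at the last record with startblock <= bno */
	error = xfs_alloc_lookup_le(bno_cur, bno, 1, &i);
	if (error)
		return error;
	if (!i)
		return ENOENT;		/* no record at or below bno */

	/* read the record the cursor now points at */
	error = xfs_alloc_get_rec(bno_cur, fbno, flen, &i);
	if (error)
		return error;
	ASSERT(i == 1);
	ASSERT(*fbno <= bno);
	return 0;
}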
*/ -STATIC int /* error */ +int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t *bno, /* output: starting block of extent */ @@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact( xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); + /* * Allocate/initialize a cursor for the by-number freespace btree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->agno, XFS_BTNUM_BNO); + /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). * Look for the closest free block <= bno, it must contain bno * if any free block does. */ - if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) + error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); + if (error) goto error0; - if (!i) { - /* - * Didn't find it, return null. - */ - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + if (!i) + goto not_found; + /* * Grab the freespace record. */ - if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) + error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); + if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); ASSERT(fbno <= args->agbno); minend = args->agbno + args->minlen; maxend = args->agbno + args->maxlen; fend = fbno + flen; + /* * Give up if the freespace isn't long enough for the minimum request. */ - if (fend < minend) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + if (fend < minend) + goto not_found; + /* * End of extent will be smaller of the freespace end and the * maximal requested end. - */ - end = XFS_AGBLOCK_MIN(fend, maxend); - /* + * * Fix the length according to mod and prod if given. */ + end = XFS_AGBLOCK_MIN(fend, maxend); args->len = end - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - return 0; - } + if (!xfs_alloc_fix_minleft(args)) + goto not_found; + rlen = args->len; ASSERT(args->agbno + rlen <= fend); end = args->agbno + rlen; + /* * We are allocating agbno for rlen [agbno .. end] * Allocate/initialize a cursor for the by-size btree. @@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact( args->agno, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, - args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { + error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, + args->len, XFSA_FIXUP_BNO_OK); + if (error) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); goto error0; } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_exact_done(args); args->wasfromfl = 0; + trace_xfs_alloc_exact_done(args); + return 0; + +not_found: + /* Didn't find it, return null. */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_exact_notfound(args); return 0; error0: @@ -659,6 +661,95 @@ error0: } /* + * Search the btree in a given direction via the search cursor and compare + * the records found against the good extent we've already found. 
+ */ +STATIC int +xfs_alloc_find_best_extent( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur **gcur, /* good cursor */ + struct xfs_btree_cur **scur, /* searching cursor */ + xfs_agblock_t gdiff, /* difference for search comparison */ + xfs_agblock_t *sbno, /* extent found by search */ + xfs_extlen_t *slen, + xfs_extlen_t *slena, /* aligned length */ + int dir) /* 0 = search right, 1 = search left */ +{ + xfs_agblock_t bno; + xfs_agblock_t new; + xfs_agblock_t sdiff; + int error; + int i; + + /* The good extent is perfect, no need to search. */ + if (!gdiff) + goto out_use_good; + + /* + * Look until we find a better one, run out of space or run off the end. + */ + do { + error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, + args->minlen, &bno, slena); + + /* + * The good extent is closer than this one. + */ + if (!dir) { + if (bno >= args->agbno + gdiff) + goto out_use_good; + } else { + if (bno <= args->agbno - gdiff) + goto out_use_good; + } + + /* + * Same distance, compare length and pick the best. + */ + if (*slena >= args->minlen) { + args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); + xfs_alloc_fix_len(args); + + sdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, *sbno, + *slen, &new); + + /* + * Choose closer size and invalidate other cursor. + */ + if (sdiff < gdiff) + goto out_use_search; + goto out_use_good; + } + + if (!dir) + error = xfs_btree_increment(*scur, 0, &i); + else + error = xfs_btree_decrement(*scur, 0, &i); + if (error) + goto error0; + } while (i); + +out_use_good: + xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); + *scur = NULL; + return 0; + +out_use_search: + xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); + *gcur = NULL; + return 0; + +error0: + /* caller invalidates cursors */ + return error; +} + +/* * Allocate a variable extent near bno in the allocation group agno. * Extent's length (returned in len) will be between minlen and maxlen, * and of the form k * prod + mod unless there's nothing that large. @@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near( } } } while (bno_cur_lt || bno_cur_gt); + /* * Got both cursors still active, need to find better entry. */ if (bno_cur_lt && bno_cur_gt) { - /* - * Left side is long enough, look for a right side entry. - */ if (ltlena >= args->minlen) { /* - * Fix up the length. + * Left side is good, look for a right side entry. */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, ltbno, ltlen, <new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, >bno, >len, >lena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + /* - * Not perfect. - */ - if (ltdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_gt, >bno, - >len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(gtbno, gtlen, - args->alignment, args->minlen, - >bnoa, >lena); - /* - * The left one is clearly better. 
- */ - if (gtbnoa >= args->agbno + ltdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (gtlena >= args->minlen) { - args->len = - XFS_EXTLEN_MIN(gtlena, - args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - gtbno, gtlen, >new); - /* - * Right side is better. - */ - if (gtdiff < ltdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - /* - * Left side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - break; - } - /* - * Fell off the right end. - */ - if ((error = xfs_btree_increment( - bno_cur_gt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - } - } - /* - * The left side is perfect, trash the right side. - */ - else { - xfs_btree_del_cursor(bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - } - /* - * It's the right side that was found first, look left. - */ - else { - /* - * Fix up the length. + * Right side is good, look for a left side entry. */ args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, gtbno, gtlen, >new); - /* - * Right side entry isn't perfect. - */ - if (gtdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_lt, <bno, - <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena); - /* - * The right one is clearly better. - */ - if (ltbnoa <= args->agbno - gtdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (ltlena >= args->minlen) { - args->len = XFS_EXTLEN_MIN( - ltlena, args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - ltbno, ltlen, <new); - /* - * Left side is better. - */ - if (ltdiff < gtdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - /* - * Right side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - break; - } - /* - * Fell off the left end. - */ - if ((error = xfs_btree_decrement( - bno_cur_lt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - } - } - /* - * The right side is perfect, trash the left side. - */ - else { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, <bno, <len, <lena, + 1 /* search left */); } + + if (error) + goto error0; } + /* * If we couldn't get anything, give up. */ @@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near( args->agbno = NULLAGBLOCK; return 0; } + /* * At this point we have selected a freespace entry, either to the * left or to the right. 
If it's on the right, copy all the @@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near( j = 1; } else j = 0; + /* * Fix up the length and compute the useful address. */ @@ -2676,7 +2611,7 @@ restart: * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. */ -static int +int xfs_alloc_busy_search( struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 895009a9727..0ab56b32c7e 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -19,6 +19,7 @@ #define __XFS_ALLOC_H__ struct xfs_buf; +struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; @@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); #ifdef __KERNEL__ - void -xfs_alloc_busy_insert(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); void xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); #endif /* __KERNEL__ */ /* @@ -205,4 +206,18 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index a6cff8edcdb..71e90dc2aeb 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * It didn't all fit, so we have to sort everything on hashval. 
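The shortform listing path above copies the entries into a temporary sbuf array (now allocated KM_SLEEP | KM_NOFS, since an ordinary allocation here could recurse back into the filesystem from memory reclaim) and then orders them by name hash. A hedged sketch of that sort step using the kernel's lib/sort.c helper; the entry layout and names below are illustrative stand-ins, not the real xfs_attr_sf_sort structure:

#include <linux/sort.h>
#include <linux/types.h>

/* Illustrative stand-in for a shortform attr entry with its name hash. */
struct example_sf_ent {
	u32	hash;		/* precomputed hash of the attribute name */
	int	index;		/* position in the shortform list */
};

static int
example_cmp_by_hash(const void *a, const void *b)
{
	const struct example_sf_ent	*ea = a;
	const struct example_sf_ent	*eb = b;

	/* return <0, 0, >0 as qsort-style comparators do */
	if (ea->hash < eb->hash)
		return -1;
	return ea->hash > eb->hash;
}

static void
example_sort_by_hash(struct example_sf_ent *ents, size_t n)
{
	/* NULL swap_func lets sort() fall back to its generic swapper */
	sort(ents, n, sizeof(*ents), example_cmp_by_hash, NULL);
}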
*/ sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); /* * Scan the attribute list for the rest of the entries, storing @@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP); + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 04f9cca8da7..2f9e97c128a 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -634,9 +634,8 @@ xfs_btree_read_bufl( return error; } ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { + if (bp) XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - } *bpp = bp; return 0; } @@ -944,13 +943,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2686d0d54c5..98c6f73b675 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -141,8 +141,7 @@ xfs_buf_item_log_check( #define xfs_buf_item_log_check(x) #endif -STATIC void xfs_buf_error_relse(xfs_buf_t *bp); -STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* * This returns the number of log iovecs needed to log the @@ -450,7 +449,7 @@ xfs_buf_item_unpin( * xfs_trans_ail_delete() drops the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); + xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); } else { @@ -918,15 +917,26 @@ xfs_buf_attach_iodone( XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); } +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow a single + * callback to scan the remaining lip->li_bio_list for other items of the + * same type and process their callbacks in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. It removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. This allows the + * callback to scan and modify the list attached to the buffer and we don't + * have to care about maintaining a next item pointer.
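The comment above describes a list walk that tolerates concurrent modification by its own callbacks. A standalone sketch of the pop-and-run pattern with illustrative types; only the shape of the loop mirrors xfs_buf_do_callbacks():

struct example_item {
	struct example_item	*next;
	void			(*cb)(struct example_item *);
};

static void
example_run_callbacks(struct example_item **headp)
{
	struct example_item	*it;

	/* re-read the head every pass: the callback may have changed it */
	while ((it = *headp) != NULL) {
		*headp = it->next;	/* detach before running the callback */
		it->next = NULL;	/* no stale list state visible to cb */
		it->cb(it);		/* may consume further items off *headp */
	}
}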
+ */ STATIC void xfs_buf_do_callbacks( - xfs_buf_t *bp, - xfs_log_item_t *lip) + struct xfs_buf *bp) { - xfs_log_item_t *nlip; + struct xfs_log_item *lip; - while (lip != NULL) { - nlip = lip->li_bio_list; + while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { + XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); ASSERT(lip->li_cb != NULL); /* * Clear the next pointer so we don't have any @@ -936,7 +946,6 @@ xfs_buf_do_callbacks( */ lip->li_bio_list = NULL; lip->li_cb(bp, lip); - lip = nlip; } } @@ -949,128 +958,76 @@ xfs_buf_do_callbacks( */ void xfs_buf_iodone_callbacks( - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_log_item_t *lip; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - xfs_mount_t *mp; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; - if (XFS_BUF_GETERROR(bp) != 0) { - /* - * If we've already decided to shutdown the filesystem - * because of IO errors, there's no point in giving this - * a retry. - */ - mp = lip->li_mountp; - if (XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - XFS_BUF_SUPER_STALE(bp); - trace_xfs_buf_item_iodone(bp, _RET_IP_); - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); - return; - } + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } - if ((XFS_BUF_TARGET(bp) != lasttarg) || - (time_after(jiffies, (lasttime + 5*HZ)))) { - lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); - } - lasttarg = XFS_BUF_TARGET(bp); + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + cmn_err(CE_ALERT, "Device %s, XFS metadata write error" + " block 0x%llx in %s", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + } + lasttarg = XFS_BUF_TARGET(bp); - if (XFS_BUF_ISASYNC(bp)) { - /* - * If the write was asynchronous then noone will be - * looking for the error. Clear the error state - * and write the buffer out again delayed write. - * - * XXXsup This is OK, so long as we catch these - * before we start the umount; we don't want these - * DELWRI metadata bufs to be hanging around. - */ - XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ - - if (!(XFS_BUF_ISSTALE(bp))) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); - } else { - /* - * If the write of the buffer was not asynchronous, - * then we want to make sure to return the error - * to the caller of bwrite(). Because of this we - * cannot clear the B_ERROR state at this point. - * Instead we install a callback function that - * will be called when the buffer is released, and - * that routine will clear the error state and - * set the buffer to be written out again after - * some delay. 
- */ - /* We actually overwrite the existing b-relse function at times, but we're gonna be shutting down anyway. */ - XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); + /* + * If the write was asynchronous then no one will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronously, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_FINISH_IOWAIT(bp); + XFS_BUF_SET_START(bp); } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); return; } - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); -} - -/* - * This is a callback routine attached to a buffer which gets an error - * when being written out synchronously. - */ -STATIC void -xfs_buf_error_relse( - xfs_buf_t *bp) -{ - xfs_log_item_t *lip; - xfs_mount_t *mp; - - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - mp = (xfs_mount_t *)lip->li_mountp; - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_ERROR(bp,0); trace_xfs_buf_error_relse(bp, _RET_IP_); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - if (! XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - /* - * We have to unpin the pinned buffers so do the - * callbacks. - */ - xfs_buf_do_callbacks(bp, lip); +do_callbacks: + xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_SET_BRELSE_FUNC(bp,NULL); - xfs_buf_relse(bp); + xfs_buf_ioend(bp, 0); } - /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 0e2ed43f16c..b6ecd2061e7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item { xfs_buf_log_format_t bli_format; /* in-log header */ } xfs_buf_log_item_t; -/* - * This structure is used during recovery to record the buf log - * items which have been canceled and should not be replayed. - */ -typedef struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct xfs_buf_cancel *bc_next; -} xfs_buf_cancel_t; - void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c78cc6a3d87..4c7db74a05f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) } #endif /* DEBUG */ - -void -xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - -void -xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
-{ - va_list ap; - -#ifdef DEBUG - xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); -#endif - - if (xfs_panic_mask && (xfs_panic_mask & panic_tag) - && (level & CE_ALERT)) { - level &= ~CE_ALERT; - level |= CE_PANIC; - cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); - } - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - void xfs_error_report( const char *tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index f338847f80b..10dce5475f0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) @@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); struct xfs_mount; -extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, - char *fmt, va_list ap) - __attribute__ ((format (printf, 3, 0))); -extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...) - __attribute__ ((format (printf, 4, 5))); -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); - extern void xfs_hex_dump(void *p, int length); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) + do { \ + if (!(f & XFS_MFSI_QUIET)) \ + cmn_err(CE_WARN, "XFS: " fmt, ## args); \ + } while (0) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a55e687bf56..75f2ef60e57 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -48,6 +48,28 @@ xfs_efi_item_free( } /* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the + * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees + * the EFI. + */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_efi_item_free(efip); + } +} + +/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. 
@@ -74,7 +96,8 @@ xfs_efi_item_format( struct xfs_efi_log_item *efip = EFI_ITEM(lip); uint size; - ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); efip->efi_format.efi_type = XFS_LI_EFI; @@ -99,10 +122,12 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the - * last place at which the EFI is manipulated during a transaction. - * Here we coordinate with xfs_efi_cancel() to determine who gets to - * free the EFI. + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() (via + * XFS_EFI_COMMITTED) to determine who gets to free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -110,20 +135,14 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_ail *ailp = lip->li_ailp; - - spin_lock(&ailp->xa_lock); - if (efip->efi_flags & XFS_EFI_CANCELED) { - if (remove) - xfs_trans_del_item(lip); - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + xfs_trans_del_item(lip); xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_COMMITTED; - spin_unlock(&ailp->xa_lock); + return; } + __xfs_efi_release(efip); } /* @@ -152,16 +171,20 @@ xfs_efi_item_unlock( } /* - * The EFI is logged only once and cannot be moved in the log, so - * simply return the lsn at which it's been logged. The canceled - * flag is not paid any attention here. Checking for that is delayed - * until the EFI is unpinned. + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. For bulk transaction committed + * processing, the EFI may be processed but not yet unpinned prior to the EFD + * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected + * when processing the EFD. */ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -230,6 +253,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); return efip; } @@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to - * the given efi item. Each efd calls this with the number of - * extents that it has logged, and when the sum of these reaches - * the total number of extents logged by this efi item we can free - * the efi item. - * - * Freeing the efi item requires that we remove it from the AIL. - * We'll use the AIL lock to protect our counters as well as - * the removal from the AIL. + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. 
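The EFD-side release described above is a countdown: each EFD subtracts the number of extents it covered, and whoever performs the final subtraction coordinates with the unpin path for the actual free. A hedged sketch of that pattern with illustrative names; the AIL removal is reduced to a simple kfree() stand-in:

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/slab.h>

#define EXAMPLE_COMMITTED	2	/* mirrors the XFS_EFI_COMMITTED bit */

struct example_intent {
	atomic_t	remaining;	/* extents not yet accounted for */
	unsigned long	flags;
};

static void
__example_intent_release(struct example_intent *ei)
{
	/*
	 * The first path through here finds the COMMITTED bit set, clears
	 * it and leaves the item alone; the last one finds it already
	 * clear and performs the free, so only the last caller frees.
	 */
	if (!test_and_clear_bit(EXAMPLE_COMMITTED, &ei->flags))
		kfree(ei);	/* stands in for AIL removal + item free */
}

static void
example_intent_release(struct example_intent *ei, unsigned int nextents)
{
	/* atomic_sub_and_test() returns true only for the final decrement */
	if (atomic_sub_and_test(nextents, &ei->remaining))
		__example_intent_release(ei);
}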
*/ void xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { - struct xfs_ail *ailp = efip->efi_item.li_ailp; - int extents_left; - - ASSERT(efip->efi_next_extent > 0); - ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); - - spin_lock(&ailp->xa_lock); - ASSERT(efip->efi_next_extent >= nextents); - efip->efi_next_extent -= nextents; - extents_left = efip->efi_next_extent; - if (extents_left == 0) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); - xfs_efi_item_free(efip); - } else { - spin_unlock(&ailp->xa_lock); - } + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + __xfs_efi_release(efip); } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0d22c56fdf6..375f68e4253 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 { #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flags. + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. */ -#define XFS_EFI_RECOVERED 0x1 -#define XFS_EFI_COMMITTED 0x2 -#define XFS_EFI_CANCELED 0x4 +#define XFS_EFI_RECOVERED 1 +#define XFS_EFI_COMMITTED 2 /* * This is the "extent free intention" log item. It is used @@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 { */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; - uint efi_flags; /* misc flags */ - uint efi_next_extent; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; } xfs_efi_log_item_t; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a7c116e814a..cec89dd5d7d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -374,6 +374,7 @@ xfs_growfs_data_private( mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else mp->m_maxicount = 0; + xfs_set_low_space_thresholds(mp); /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { @@ -611,12 +612,13 @@ out: * * We cannot use an inode here for this - that will push dirty state back up * into the VFS and then periodic inode flushing will prevent log covering from - * making progress. Hence we log a field in the superblock instead. + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. 
*/ int xfs_fs_log_dummy( - xfs_mount_t *mp, - int flags) + xfs_mount_t *mp) { xfs_trans_t *tp; int error; @@ -631,8 +633,7 @@ xfs_fs_log_dummy( /* log the UUID because it is an unchanging field */ xfs_mod_sb(tp, XFS_SB_UUID); - if (flags & SYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index a786c5212c1..1b6a98b6688 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index d7de5a3f786..cb9b6d1469f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -43,6 +43,17 @@ /* + * Define xfs inode iolock lockdep classes. We need to ensure that all active + * inodes are considered the same for lockdep purposes, including inodes that + * are recycled through the XFS_IRECLAIMABLE state. This is the only way to + * guarantee the locks are considered the same when there are multiple lock + * initialisation sites. Also, define a reclaimable inode class so it is + * obvious in lockdep reports which class the report is against. + */ +static struct lock_class_key xfs_iolock_active; +struct lock_class_key xfs_iolock_reclaimable; + +/* * Allocate and initialise an xfs_inode. */ STATIC struct xfs_inode * @@ -69,8 +80,11 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); + ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); /* initialise the xfs inode */ ip->i_ino = ino; @@ -85,9 +99,6 @@ xfs_inode_alloc( ip->i_size = 0; ip->i_new_size = 0; - /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW; - return ip; } @@ -145,7 +156,18 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } /* @@ -155,14 +177,29 @@ static int xfs_iget_cache_hit( struct xfs_perag *pag, struct xfs_inode *ip, + xfs_ino_t ino, int flags, - int lock_flags) __releases(pag->pag_ici_lock) + int lock_flags) __releases(RCU) { struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; int error; + /* + * Check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too.
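The recycling hazard described above is generic to RCU-protected radix-tree lookups: the tree may still return a pointer to an object whose identity changed inside the grace period. A minimal sketch of the revalidation idiom with illustrative types; the real code additionally handles the XFS_IRECLAIM flag and the igrab() path:

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct example_inode {
	spinlock_t	flags_lock;	/* plays the role of i_flags_lock */
	unsigned long	ino;		/* zeroed before the RCU free */
};

static struct example_inode *
example_lookup(struct radix_tree_root *root, unsigned long ino)
{
	struct example_inode	*ip;

	rcu_read_lock();
	ip = radix_tree_lookup(root, ino);
	if (ip) {
		spin_lock(&ip->flags_lock);
		if (ip->ino != ino) {
			/* freed (ino == 0) or reallocated: caller retries */
			spin_unlock(&ip->flags_lock);
			ip = NULL;
		} else {
			/* ... take a reference before dropping the lock ... */
			spin_unlock(&ip->flags_lock);
		}
	}
	rcu_read_unlock();
	return ip;
}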
+ */ spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + /* * If we are racing with another cache hit that is currently @@ -205,7 +242,7 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); error = -inode_init_always(mp->m_super, inode); if (error) { @@ -213,7 +250,7 @@ xfs_iget_cache_hit( * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_INEW; @@ -223,14 +260,20 @@ xfs_iget_cache_hit( goto out_error; } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ip->i_flags |= XFS_INEW; __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -241,7 +284,7 @@ xfs_iget_cache_hit( /* We've got a live one. */ spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); trace_xfs_iget_hit(ip); } @@ -255,7 +298,7 @@ xfs_iget_cache_hit( out_error: spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); return error; } @@ -308,7 +351,7 @@ xfs_iget_cache_miss( BUG(); } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); /* insert the new inode */ error = radix_tree_insert(&pag->pag_ici_root, agino, ip); @@ -323,14 +366,14 @@ xfs_iget_cache_miss( ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); *ipp = ip; return 0; out_preload_end: - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); if (lock_flags) xfs_iunlock(ip, lock_flags); @@ -377,7 +420,7 @@ xfs_iget( xfs_agino_t agino; /* reject inode numbers outside existing AGs */ - if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; /* get the perag structure and ensure that it's inode capable */ @@ -386,15 +429,15 @@ xfs_iget( again: error = 0; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); if (ip) { - error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); if (error) goto out_error_or_again; } else { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); XFS_STATS_INC(xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 108c7a085f9..be7cf625421 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -887,7 +887,7 @@ xfs_iread( * around for a while. This helps to keep recently accessed * meta-data in-core longer. 
*/ - XFS_BUF_SET_REF(bp, XFS_INO_REF); + xfs_buf_set_ref(bp, XFS_INO_REF); /* * Use xfs_trans_brelse() to release the buffer containing the @@ -2000,17 +2000,33 @@ xfs_ifree_cluster( */ for (i = 0; i < ninodes; i++) { retry: - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); - /* Inode not in memory or stale, nothing to do */ - if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { - read_unlock(&pag->pag_ici_lock); + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); continue; } /* + * Because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* * Don't try to lock/unlock the current inode, but we * _cannot_ skip the other inodes that we did not find * in the list attached to the buffer and are not @@ -2019,11 +2035,11 @@ retry: */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); delay(1); goto retry; } - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); @@ -2629,7 +2645,7 @@ xfs_iflush_cluster( mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); /* really need a gang lookup range call here */ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, first_index, inodes_per_cluster); @@ -2640,9 +2656,21 @@ xfs_iflush_cluster( iq = ilist[i]; if (iq == ip) continue; - /* if the inode lies outside this cluster, we're done. */ - if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) - break; + + /* + * Because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&iq->i_flags_lock); + if (!iq->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&iq->i_flags_lock); + continue; + } + spin_unlock(&iq->i_flags_lock); + /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated @@ -2692,7 +2720,7 @@ xfs_iflush_cluster( } out_free: - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); kmem_free(ilist); out_put: xfs_perag_put(pag); @@ -2704,7 +2732,7 @@ cluster_corrupt_out: * Corruption detected in the clustering loop. Invalidate the * inode buffer and shut down the filesystem. */ - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); /* * Clean up the buffer. If it was B_DELWRI, just release it -- * brelse can handle it with no problems. If not, shut down the diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fb2ca2e4cdc..5c95fa8ec11 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) /* * In-core inode flags.
*/ -#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ +#define XFS_ISTALE 0x0002 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ +#define XFS_INEW 0x0008 /* inode has just been allocated */ +#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ +#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ /* * Flags for inode locking. @@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) +extern struct lock_class_key xfs_iolock_reclaimable; + /* * Flags for xfs_itruncate_start(). */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7c8d30c453c..fd4f398bd6f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -842,15 +842,64 @@ xfs_inode_item_destroy( * flushed to disk. It is responsible for removing the inode item * from the AIL if it has not been re-logged, and unlocking the inode's * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. */ void xfs_iflush_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - xfs_inode_t *ip = iip->ili_inode; + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + prev = NULL; + while (blip != NULL) { + if (blip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + XFS_BUF_SET_FSPRIVATE(bp, next); + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; + + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; /* * We only want to pull the item from the AIL if it is * actually there and its location in the log has not * changed since we started the flush. Thus, we only bother * if the ili_logged flag is set and the inode's lsn has not * changed. First we check the lsn outside * the lock since it's cheaper, and then we recheck while * holding the lock before removing the inode from the AIL.
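The unlocked-check/locked-recheck idiom in this comment is worth seeing in isolation: the racy test only decides whether taking the AIL lock is worthwhile, and the authoritative test is repeated under the lock. A sketch with illustrative types; the deletion helper is a hypothetical stand-in:

#include <linux/spinlock.h>
#include <linux/types.h>

struct example_ail {
	spinlock_t	lock;
};

struct example_log_item {
	int		logged;
	u64		lsn;
};

/* hypothetical removal helper, assumed to exist for the sketch */
static void example_ail_delete(struct example_ail *ail,
			       struct example_log_item *it);

static void
example_maybe_delete(struct example_ail *ail, struct example_log_item *it,
		     u64 flush_lsn)
{
	/* cheap unlocked peek; both fields may change under us */
	if (!it->logged || it->lsn != flush_lsn)
		return;

	spin_lock(&ail->lock);
	/* recheck now that the state cannot change */
	if (it->logged && it->lsn == flush_lsn)
		example_ail_delete(ail, it);
	spin_unlock(&ail->lock);
}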
*/ - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; spin_lock(&ailp->xa_lock); - if (lip->li_lsn == iip->ili_flush_lsn) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, lip); - } else { - spin_unlock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i); } - iip->ili_logged = 0; /* - * Clear the ili_last_fields bits now that we know that the - * data corresponding to them is safely on disk. + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. */ - iip->ili_last_fields = 0; + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; - /* - * Release the inode's flush lock since we're done with it. - */ - xfs_ifunlock(ip); + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } } /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 20576146369..55582bd6665 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -47,127 +47,8 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -#define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *); -STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); - -int -xfs_iomap( - struct xfs_inode *ip, - xfs_off_t offset, - ssize_t count, - int flags, - struct xfs_bmbt_irec *imap, - int *nimaps, - int *new) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - int bmapi_flags = 0; - - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - - *new = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - trace_xfs_iomap_enter(ip, offset, count, flags, NULL); - - switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { - case BMAPI_READ: - lockmode = xfs_ilock_map_shared(ip); - bmapi_flags = XFS_BMAPI_ENTIRE; - break; - case BMAPI_WRITE: - lockmode = XFS_ILOCK_EXCL; - if (flags & BMAPI_IGNSTATE) - bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; - xfs_ilock(ip, lockmode); - break; - case BMAPI_ALLOCATE: - lockmode = XFS_ILOCK_SHARED; - bmapi_flags = XFS_BMAPI_ENTIRE; - - /* Attempt non-blocking lock */ - if (flags & BMAPI_TRYLOCK) { - if (!xfs_ilock_nowait(ip, lockmode)) - return XFS_ERROR(EAGAIN); - } else { - xfs_ilock(ip, lockmode); - } - break; - default: - BUG(); - } - - ASSERT(offset <= mp->m_maxioffset); - if ((xfs_fsize_t)offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, imap, - nimaps, NULL); - - if (error) - goto out; - - switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { - case BMAPI_WRITE: - /* If we found an extent, return 
it */ - if (*nimaps && - (imap->br_startblock != HOLESTARTBLOCK) && - (imap->br_startblock != DELAYSTARTBLOCK)) { - trace_xfs_iomap_found(ip, offset, count, flags, imap); - break; - } - if (flags & BMAPI_DIRECT) { - error = xfs_iomap_write_direct(ip, offset, count, flags, - imap, nimaps); - } else { - error = xfs_iomap_write_delay(ip, offset, count, flags, - imap, nimaps); - } - if (!error) { - trace_xfs_iomap_alloc(ip, offset, count, flags, imap); - } - *new = 1; - break; - case BMAPI_ALLOCATE: - /* If we found an extent, return it */ - xfs_iunlock(ip, lockmode); - lockmode = 0; - - if (*nimaps && !isnullstartblock(imap->br_startblock)) { - trace_xfs_iomap_found(ip, offset, count, flags, imap); - break; - } - - error = xfs_iomap_write_allocate(ip, offset, count, - imap, nimaps); - break; - } - - ASSERT(*nimaps <= 1); - -out: - if (lockmode) - xfs_iunlock(ip, lockmode); - return XFS_ERROR(error); -} - STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -STATIC int +int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int flags, xfs_bmbt_irec_t *imap, - int *nmaps) + int nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -279,7 +159,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) imap->br_blockcount + imap->br_startoff); @@ -331,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; - if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) + if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -370,7 +250,6 @@ xfs_iomap_write_direct( goto error_out; } - *nmaps = 1; return 0; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ @@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - *nmaps = 0; /* nothing set-up here */ error_out: return XFS_ERROR(error); @@ -389,6 +267,9 @@ error_out: * If the caller is doing a write at the end of the file, then extend the * allocation out to the file system's write iosize. We clean up any extra * space left over when the file is closed in xfs_inactive(). + * + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it is not needed. */ STATIC int xfs_iomap_eof_want_preallocate( @@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, xfs_bmbt_irec_t *imap, int nimaps, int *prealloc) @@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate( xfs_filblks_t count_fsb; xfs_fsblock_t firstblock; int n, error, imaps; + int found_delalloc = 0; *prealloc = 0; if ((offset + count) <= ip->i_size) @@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate( return 0; start_fsb += imap[n].br_blockcount; count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; } } - *prealloc = 1; + if (!found_delalloc) + *prealloc = 1; return 0; } -STATIC int +/* + * If we don't have a user-specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full.
+ +int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, - xfs_bmbt_irec_t *ret_imap, - int *nmaps) + xfs_bmbt_irec_t *ret_imap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -469,16 +396,19 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, - ioflag, imap, XFS_WRITE_IMAPS, &prealloc); + imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; retry: if (prealloc) { + xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + mp->m_writeio_blocks; + last_fsb = ioalign + alloc_blocks; } else { last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); } @@ -496,22 +426,31 @@ retry: XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, &nimaps, NULL); - if (error && (error != ENOSPC)) + switch (error) { + case 0: + case ENOSPC: + case EDQUOT: + break; + default: return XFS_ERROR(error); + } /* - * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space - flush all other inodes with - * delalloc blocks and retry without EOF preallocation. + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For + * ENOSPC, flush all other inodes with delalloc blocks to free up + * some of the excess reserved metadata space. For both cases, retry + * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); if (flushed) - return XFS_ERROR(ENOSPC); + return XFS_ERROR(error ? error : ENOSPC); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error == ENOSPC) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + } flushed = 1; error = 0; @@ -523,8 +462,6 @@ retry: return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; - *nmaps = 1; - return 0; } @@ -538,13 +475,12 @@ retry: * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range.
*/ -STATIC int +int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - xfs_bmbt_irec_t *imap, - int *retmap) + xfs_bmbt_irec_t *imap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb, last_block; @@ -557,8 +493,6 @@ xfs_iomap_write_allocate( int error = 0; int nres; - *retmap = 0; - /* * Make sure that the dquots are there. */ @@ -680,7 +614,6 @@ xfs_iomap_write_allocate( if ((offset_fsb >= imap->br_startoff) && (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { - *retmap = 1; XFS_STATS_INC(xs_xstrat_quick); return 0; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7748a430f50..80615760959 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,30 +18,15 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -/* base extent manipulation calls */ -#define BMAPI_READ (1 << 0) /* read extents */ -#define BMAPI_WRITE (1 << 1) /* create extents */ -#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */ - -/* modifiers */ -#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */ -#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */ -#define BMAPI_MMA (1 << 6) /* allocate for mmap write */ -#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */ - -#define BMAPI_FLAGS \ - { BMAPI_READ, "READ" }, \ - { BMAPI_WRITE, "WRITE" }, \ - { BMAPI_ALLOCATE, "ALLOCATE" }, \ - { BMAPI_IGNSTATE, "IGNSTATE" }, \ - { BMAPI_DIRECT, "DIRECT" }, \ - { BMAPI_TRYLOCK, "TRYLOCK" } - struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, - struct xfs_bmbt_irec *, int *, int *); +extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index cee4ab9f8a9..ae6fef1ff56 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); +STATIC int xlog_space_left(struct log *log, atomic64_t *head); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); @@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); /* local functions to manipulate grant head */ STATIC int xlog_grant_log_space(xlog_t *log, xlog_ticket_t *xtic); -STATIC void xlog_grant_push_ail(xfs_mount_t *mp, +STATIC void xlog_grant_push_ail(struct log *log, int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); @@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); -STATIC void xlog_verify_grant_head(xlog_t *log, int equals); +STATIC void xlog_verify_grant_tail(struct log *log); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) -#define xlog_verify_grant_head(a,b) +#define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c,d) #define 
xlog_verify_tail_lsn(a,b,c) #endif STATIC int xlog_iclogs_empty(xlog_t *log); - static void -xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +xlog_grant_sub_space( + struct log *log, + atomic64_t *head, + int bytes) { - if (*qp) { - tic->t_next = (*qp); - tic->t_prev = (*qp)->t_prev; - (*qp)->t_prev->t_next = tic; - (*qp)->t_prev = tic; - } else { - tic->t_prev = tic->t_next = tic; - *qp = tic; - } + int64_t head_val = atomic64_read(head); + int64_t new, old; - tic->t_flags |= XLOG_TIC_IN_Q; -} + do { + int cycle, space; -static void -xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) -{ - if (tic == tic->t_next) { - *qp = NULL; - } else { - *qp = tic->t_next; - tic->t_next->t_prev = tic->t_prev; - tic->t_prev->t_next = tic->t_next; - } + xlog_crack_grant_head_val(head_val, &cycle, &space); - tic->t_next = tic->t_prev = NULL; - tic->t_flags &= ~XLOG_TIC_IN_Q; + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } static void -xlog_grant_sub_space(struct log *log, int bytes) +xlog_grant_add_space( + struct log *log, + atomic64_t *head, + int bytes) { - log->l_grant_write_bytes -= bytes; - if (log->l_grant_write_bytes < 0) { - log->l_grant_write_bytes += log->l_logsize; - log->l_grant_write_cycle--; - } - - log->l_grant_reserve_bytes -= bytes; - if ((log)->l_grant_reserve_bytes < 0) { - log->l_grant_reserve_bytes += log->l_logsize; - log->l_grant_reserve_cycle--; - } + int64_t head_val = atomic64_read(head); + int64_t new, old; -} + do { + int tmp; + int cycle, space; -static void -xlog_grant_add_space_write(struct log *log, int bytes) -{ - int tmp = log->l_logsize - log->l_grant_write_bytes; - if (tmp > bytes) - log->l_grant_write_bytes += bytes; - else { - log->l_grant_write_cycle++; - log->l_grant_write_bytes = bytes - tmp; - } -} + xlog_crack_grant_head_val(head_val, &cycle, &space); -static void -xlog_grant_add_space_reserve(struct log *log, int bytes) -{ - int tmp = log->l_logsize - log->l_grant_reserve_bytes; - if (tmp > bytes) - log->l_grant_reserve_bytes += bytes; - else { - log->l_grant_reserve_cycle++; - log->l_grant_reserve_bytes = bytes - tmp; - } -} + tmp = log->l_logsize - space; + if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } -static inline void -xlog_grant_add_space(struct log *log, int bytes) -{ - xlog_grant_add_space_write(log, bytes); - xlog_grant_add_space_reserve(log, bytes); + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } static void @@ -355,7 +330,7 @@ xfs_log_reserve( trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(mp, internal_ticket->t_unit_res); + xlog_grant_push_ail(log, internal_ticket->t_unit_res); retval = xlog_regrant_write_log_space(log, internal_ticket); } else { /* may sleep if need to allocate more tickets */ @@ -369,7 +344,7 @@ xfs_log_reserve( trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(mp, + xlog_grant_push_ail(log, (internal_ticket->t_unit_res * internal_ticket->t_cnt)); retval = xlog_grant_log_space(log, internal_ticket); @@ -402,7 +377,7 @@ xfs_log_mount( cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); else { cmn_err(CE_NOTE, - "!Mounting filesystem \"%s\" in no-recovery mode. 
Filesystem will be inconsistent.", + "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", mp->m_fsname); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - sv_wait(&iclog->ic_force_wait, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { spin_unlock(&log->l_icloglock); } @@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - sv_wait(&iclog->ic_force_wait, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { spin_unlock(&log->l_icloglock); } @@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp, { xlog_ticket_t *tic; xlog_t *log = mp->m_log; - int need_bytes, free_bytes, cycle, bytes; + int need_bytes, free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) return; - if (tail_lsn == 0) { - /* needed since sync_lsn is 64 bits */ - spin_lock(&log->l_icloglock); - tail_lsn = log->l_last_sync_lsn; - spin_unlock(&log->l_icloglock); - } - - spin_lock(&log->l_grant_lock); + if (tail_lsn == 0) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); - /* Also an invalid lsn. 1 implies that we aren't passing in a valid - * tail_lsn. - */ - if (tail_lsn != 1) { - log->l_tail_lsn = tail_lsn; - } + /* tail_lsn == 1 implies that we weren't passed a valid value. */ + if (tail_lsn != 1) + atomic64_set(&log->l_tail_lsn, tail_lsn); - if ((tic = log->l_write_headq)) { + if (!list_empty_careful(&log->l_writeq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - cycle = log->l_grant_write_cycle; - bytes = log->l_grant_write_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(tic, &log->l_writeq, t_queue) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < tic->t_unit_res && tail_lsn != 1) break; tail_lsn = 0; free_bytes -= tic->t_unit_res; - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_write_headq); + trace_xfs_log_regrant_write_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_write_lock); } - if ((tic = log->l_reserve_headq)) { + + if (!list_empty_careful(&log->l_reserveq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - cycle = log->l_grant_reserve_cycle; - bytes = log->l_grant_reserve_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { + spin_lock(&log->l_grant_reserve_lock); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + list_for_each_entry(tic, &log->l_reserveq, t_queue) { if (tic->t_flags & XLOG_TIC_PERM_RESERV) need_bytes = tic->t_unit_res*tic->t_cnt; else @@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= need_bytes; - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); + trace_xfs_log_grant_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_reserve_lock); } - spin_unlock(&log->l_grant_lock); -} /* xfs_log_move_tail */ +} /* * Determine if we have a transaction that has gone to disk @@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp) * We may be holding the log iclog lock upon entering this routine. 
*/ xfs_lsn_t -xlog_assign_tail_lsn(xfs_mount_t *mp) +xlog_assign_tail_lsn( + struct xfs_mount *mp) { - xfs_lsn_t tail_lsn; - xlog_t *log = mp->m_log; + xfs_lsn_t tail_lsn; + struct log *log = mp->m_log; tail_lsn = xfs_trans_ail_tail(mp->m_ail); - spin_lock(&log->l_grant_lock); - if (tail_lsn != 0) { - log->l_tail_lsn = tail_lsn; - } else { - tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; - } - spin_unlock(&log->l_grant_lock); + if (!tail_lsn) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; -} /* xlog_assign_tail_lsn */ - +} /* * Return the space in the log between the tail and the head. The head @@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp) * result is that we return the size of the log as the amount of space left. */ STATIC int -xlog_space_left(xlog_t *log, int cycle, int bytes) -{ - int free_bytes; - int tail_bytes; - int tail_cycle; - - tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); - tail_cycle = CYCLE_LSN(log->l_tail_lsn); - if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { - free_bytes = log->l_logsize - (bytes - tail_bytes); - } else if ((tail_cycle + 1) < cycle) { +xlog_space_left( + struct log *log, + atomic64_t *head) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) return 0; - } else if (tail_cycle < cycle) { - ASSERT(tail_cycle == (cycle - 1)); - free_bytes = tail_bytes - bytes; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; } else { /* * The reservation head is behind the tail. 
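Two building blocks recur through this rewrite and are worth restating outside the diff. First, the free-space arithmetic above, with the (cycle, bytes) pairs made explicit; a hedged standalone restatement, not part of the patch:

	static int
	space_left_model(int logsize, int tail_cycle, int tail_bytes,
			 int head_cycle, int head_bytes)
	{
		if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
			return logsize - (head_bytes - tail_bytes);
		if (tail_cycle + 1 < head_cycle)
			return 0;			/* head has lapped the tail */
		if (tail_cycle < head_cycle)		/* exactly one cycle ahead */
			return tail_bytes - head_bytes;
		return logsize;				/* head behind tail: error case */
	}

With a 1000-byte log, a tail at (cycle 3, byte 600) and a head at (cycle 4, byte 200), the head has wrapped once and 600 - 200 = 400 bytes remain before it would overwrite the tail. Second, the lockless grant-head updates seen earlier (xlog_grant_add_space/xlog_grant_sub_space) depend on packing that same (cycle, space) pair into one 64-bit word so the head can be advanced with a compare-and-exchange retry loop instead of the old l_grant_lock; a sketch of the add side, illustrative only:

	static inline void
	grant_head_add_model(atomic64_t *head, int bytes, int logsize)
	{
		int64_t old, new, cur = atomic64_read(head);

		do {
			int cycle = cur >> 32;
			int space = (cur & 0xffffffff) + bytes;

			if (space >= logsize) {		/* wrapped past the log end */
				space -= logsize;
				cycle++;
			}
			old = cur;
			new = ((int64_t)cycle << 32) | space;
			/* retry if another CPU moved the head meanwhile */
			cur = atomic64_cmpxchg(head, old, new);
		} while (cur != old);
	}

A torn read is impossible because the whole pair is sampled and swapped as a single 64-bit value, which is what the xlog_crack_*/xlog_assign_* helpers added to xfs_log_priv.h later in this diff formalise.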
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes) "xlog_space_left: head behind tail\n" " tail_cycle = %d, tail_bytes = %d\n" " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, cycle, bytes); + tail_cycle, tail_bytes, head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } return free_bytes; -} /* xlog_space_left */ +} /* @@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_flags |= XLOG_ACTIVE_RECOVERY; log->l_prev_block = -1; - log->l_tail_lsn = xlog_assign_lsn(1, 0); /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ - log->l_last_sync_lsn = log->l_tail_lsn; + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - log->l_grant_reserve_cycle = 1; - log->l_grant_write_cycle = 1; + xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); + xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); + INIT_LIST_HEAD(&log->l_reserveq); + INIT_LIST_HEAD(&log->l_writeq); + spin_lock_init(&log->l_grant_reserve_lock); + spin_lock_init(&log->l_grant_write_lock); error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { @@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); - spin_lock_init(&log->l_grant_lock); - sv_init(&log->l_flush_wait, 0, "flush_wait"); + init_waitqueue_head(&log->l_flush_wait); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp, ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); - sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); iclogp = &iclog->ic_next; } @@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp, out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - if (iclog->ic_bp) { - sv_destroy(&iclog->ic_force_wait); - sv_destroy(&iclog->ic_write_wait); + if (iclog->ic_bp) xfs_buf_free(iclog->ic_bp); - } kmem_free(iclog); } spinlock_destroy(&log->l_icloglock); - spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); @@ -1223,61 +1189,60 @@ xlog_commit_record( * water mark. In this manner, we would be creating a low water mark. */ STATIC void -xlog_grant_push_ail(xfs_mount_t *mp, - int need_bytes) +xlog_grant_push_ail( + struct log *log, + int need_bytes) { - xlog_t *log = mp->m_log; /* pointer to the log */ - xfs_lsn_t tail_lsn; /* lsn of the log tail */ - xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ - int free_blocks; /* free blocks left to write to */ - int free_bytes; /* free bytes left to write to */ - int threshold_block; /* block in lsn we'd like to be at */ - int threshold_cycle; /* lsn cycle we'd like to be at */ - int free_threshold; - - ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - - spin_lock(&log->l_grant_lock); - free_bytes = xlog_space_left(log, - log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); - tail_lsn = log->l_tail_lsn; - free_blocks = BTOBBT(free_bytes); - - /* - * Set the threshold for the minimum number of free blocks in the - * log to the maximum of what the caller needs, one quarter of the - * log, and 256 blocks. 
- */ - free_threshold = BTOBB(need_bytes); - free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = MAX(free_threshold, 256); - if (free_blocks < free_threshold) { - threshold_block = BLOCK_LSN(tail_lsn) + free_threshold; - threshold_cycle = CYCLE_LSN(tail_lsn); + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; } - threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; - /* Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. */ - if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) - threshold_lsn = log->l_last_sync_lsn; - } - spin_unlock(&log->l_grant_lock); - - /* - * Get the transaction layer to kick the dirty buffers out to - * disk asynchronously. No point in trying to do this if - * the filesystem is shutting down. - */ - if (threshold_lsn && - !XLOG_FORCED_SHUTDOWN(log)) - xfs_trans_ail_push(log->l_ailp, threshold_lsn); -} /* xlog_grant_push_ail */ + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_trans_ail_push(log->l_ailp, threshold_lsn); +} /* * The bdstrat callback function for log bufs. 
This gives us a central @@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - spin_lock(&log->l_grant_lock); - xlog_grant_add_space(log, roundoff); - spin_unlock(&log->l_grant_lock); + xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); + xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); @@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log) iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { - sv_destroy(&iclog->ic_force_wait); - sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; } spinlock_destroy(&log->l_icloglock); - spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); log->l_mp->m_log = NULL; @@ -2232,7 +2193,7 @@ xlog_state_do_callback( lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { iclog = iclog->ic_next; continue; /* Leave this iclog for * another thread */ @@ -2240,23 +2201,21 @@ xlog_state_do_callback( iclog->ic_state = XLOG_STATE_CALLBACK; - spin_unlock(&log->l_icloglock); - /* l_last_sync_lsn field protected by - * l_grant_lock. Don't worry about iclog's lsn. - * No one else can be here except us. + /* + * update the last_sync_lsn before we drop the + * icloglock to ensure we are the only one that + * can update it. */ - spin_lock(&log->l_grant_lock); - ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - log->l_last_sync_lsn = - be64_to_cpu(iclog->ic_header.h_lsn); - spin_unlock(&log->l_grant_lock); + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); - } else { - spin_unlock(&log->l_icloglock); + } else ioerrors++; - } + + spin_unlock(&log->l_icloglock); /* * Keep processing entries in the callback list until @@ -2297,7 +2256,7 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - sv_broadcast(&iclog->ic_force_wait); + wake_up_all(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2344,7 +2303,7 @@ xlog_state_do_callback( spin_unlock(&log->l_icloglock); if (wake) - sv_broadcast(&log->l_flush_wait); + wake_up_all(&log->l_flush_wait); } @@ -2395,7 +2354,7 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - sv_broadcast(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ @@ -2444,7 +2403,7 @@ restart: XFS_STATS_INC(xs_log_noiclogs); /* Wait for log writes to have flushed */ - sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); + xlog_wait(&log->l_flush_wait, &log->l_icloglock); goto restart; } @@ -2527,6 +2486,18 @@ restart: * * Once a ticket gets put onto the reserveq, it will only return after * the needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. 
+ * + * As tickets are only ever moved on and off the reserveq under the + * l_grant_reserve_lock, we only need to take that lock if we are going + * to add the ticket to the queue and sleep. We can avoid taking the lock if the + * ticket was never added to the reserveq because the t_queue list head will be + * empty and we hold the only reference to it so it can safely be checked + * unlocked. */ STATIC int xlog_grant_log_space(xlog_t *log, @@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log, { int free_bytes; int need_bytes; -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif - #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("grant Recovery problem"); #endif - /* Is there space or do we need to sleep? */ - spin_lock(&log->l_grant_lock); - trace_xfs_log_grant_enter(log, tic); + need_bytes = tic->t_unit_res; + if (tic->t_flags & XFS_LOG_PERM_RESERV) + need_bytes *= tic->t_ocnt; + /* something is already sleeping; insert new transaction at end */ - if (log->l_reserve_headq) { - xlog_ins_ticketq(&log->l_reserve_headq, tic); + if (!list_empty_careful(&log->l_reserveq)) { + spin_lock(&log->l_grant_reserve_lock); + /* recheck the queue now we are locked */ + if (list_empty(&log->l_reserveq)) { + spin_unlock(&log->l_grant_reserve_lock); + goto redo; + } + list_add_tail(&tic->t_queue, &log->l_reserveq); trace_xfs_log_grant_sleep1(log, tic); @@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log, goto error_return; XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... */ trace_xfs_log_grant_wake1(log, tic); - spin_lock(&log->l_grant_lock); } - if (tic->t_flags & XFS_LOG_PERM_RESERV) - need_bytes = tic->t_unit_res*tic->t_ocnt; - else - need_bytes = tic->t_unit_res; redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; - free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_reserve_headq, tic); + spin_lock(&log->l_grant_reserve_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_reserveq); trace_xfs_log_grant_sleep2(log, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); - - XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); - - spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - trace_xfs_log_grant_wake2(log, tic); + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + trace_xfs_log_grant_wake2(log, tic); goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); + } - /* we've got enough space */ - xlog_grant_add_space(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - /* - * Check to make sure the grant write head didn't just over lap the - * tail. If the cycles are the same, we can't be overlapping. - * Otherwise, make sure that the cycles differ by exactly one and - * check the byte count. 
- */ - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_reserve_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); } -#endif + + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_grant_exit(log, tic); - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); + xlog_verify_grant_tail(log); return 0; - error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - +error_return_unlocked: + spin_lock(&log->l_grant_reserve_lock); +error_return: + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); trace_xfs_log_grant_error(log, tic); /* @@ -2638,7 +2597,6 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_grant_log_space */ @@ -2646,17 +2604,14 @@ redo: /* * Replenish the byte reservation required by moving the grant write head. * - * + * Similar to xlog_grant_log_space, the function is structured to have a lock + * free fast path. */ STATIC int xlog_regrant_write_log_space(xlog_t *log, xlog_ticket_t *tic) { int free_bytes, need_bytes; - xlog_ticket_t *ntic; -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif tic->t_curr_res = tic->t_unit_res; xlog_tic_reset_res(tic); @@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log, panic("regrant Recovery problem"); #endif - spin_lock(&log->l_grant_lock); - trace_xfs_log_regrant_write_enter(log, tic); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; /* If there are other waiters on the queue then give them a * chance at logspace before us. Wake up the first waiters, @@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log, * this transaction. 
*/ need_bytes = tic->t_unit_res; - if ((ntic = log->l_write_headq)) { - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); - do { + if (!list_empty_careful(&log->l_writeq)) { + struct xlog_ticket *ntic; + + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(ntic, &log->l_writeq, t_queue) { ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; - sv_signal(&ntic->t_wait); - ntic = ntic->t_next; - } while (ntic != log->l_write_headq); - - if (ntic != log->l_write_headq) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_write_headq, tic); + wake_up(&ntic->t_wait); + } + if (ntic != list_first_entry(&log->l_writeq, + struct xlog_ticket, t_queue)) { + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); trace_xfs_log_regrant_write_sleep1(log, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); + xlog_grant_push_ail(log, need_bytes); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, - &log->l_grant_lock, s); - - /* If we're shutting down, this tic is already - * off the queue */ - spin_lock(&log->l_grant_lock); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); trace_xfs_log_regrant_write_wake1(log, tic); - } + } else + spin_unlock(&log->l_grant_write_lock); } redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_write_headq, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); - - XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_regrant_write_sleep2(log, tic); - - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + spin_lock(&log->l_grant_write_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); - /* If we're shutting down, this tic is already off the queue */ - spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep2(log, tic); + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + trace_xfs_log_regrant_write_wake2(log, tic); goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_write_headq, tic); + } - /* we've got enough space */ - xlog_grant_add_space_write(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_write_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); } -#endif + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_regrant_write_exit(log, tic); - - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); + xlog_verify_grant_tail(log); return 0; + error_return_unlocked: + spin_lock(&log->l_grant_write_lock); 
error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); trace_xfs_log_regrant_write_error(log, tic); /* @@ -2778,7 +2714,6 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_regrant_write_log_space */ @@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) ticket->t_cnt--; - spin_lock(&log->l_grant_lock); - xlog_grant_sub_space(log, ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_reserve_head, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_write_head, + ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); trace_xfs_log_regrant_reserve_sub(log, ticket); - xlog_verify_grant_head(log, 1); - /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) { - spin_unlock(&log->l_grant_lock); + if (ticket->t_cnt > 0) return; - } - xlog_grant_add_space_reserve(log, ticket->t_unit_res); + xlog_grant_add_space(log, &log->l_grant_reserve_head, + ticket->t_unit_res); trace_xfs_log_regrant_reserve_exit(log, ticket); - xlog_verify_grant_head(log, 0); - spin_unlock(&log->l_grant_lock); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); } /* xlog_regrant_reserve_log_space */ @@ -2843,28 +2775,29 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket) { + int bytes; + if (ticket->t_cnt > 0) ticket->t_cnt--; - spin_lock(&log->l_grant_lock); trace_xfs_log_ungrant_enter(log, ticket); - - xlog_grant_sub_space(log, ticket->t_curr_res); - trace_xfs_log_ungrant_sub(log, ticket); - /* If this is a permanent reservation ticket, we may be able to free + /* + * If this is a permanent reservation ticket, we may be able to free * up more space based on the remaining count. 
*/ + bytes = ticket->t_curr_res; if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); - xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); + bytes += ticket->t_unit_res*ticket->t_cnt; } + xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); + xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); + trace_xfs_log_ungrant_exit(log, ticket); - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); xfs_log_move_tail(log->l_mp, 1); } /* xlog_ungrant_log_space */ @@ -2901,11 +2834,11 @@ xlog_state_release_iclog( if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { /* update tail before writing to iclog */ - xlog_assign_tail_lsn(log->l_mp); + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); - xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); /* cycle incremented when incrementing curr_block */ } spin_unlock(&log->l_icloglock); @@ -3088,7 +3021,7 @@ maybe_sleep: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3206,8 +3139,8 @@ try_again: XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_write_wait, - PSWP, &log->l_icloglock, s); + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); if (log_flushed) *log_flushed = 1; already_slept = 1; @@ -3235,7 +3168,7 @@ try_again: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3310,10 +3243,8 @@ xfs_log_ticket_put( xlog_ticket_t *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); - if (atomic_dec_and_test(&ticket->t_ref)) { - sv_destroy(&ticket->t_wait); + if (atomic_dec_and_test(&ticket->t_ref)) kmem_zone_free(xfs_log_ticket_zone, ticket); - } } xlog_ticket_t * @@ -3435,6 +3366,7 @@ xlog_ticket_alloc( } atomic_set(&tic->t_ref, 1); + INIT_LIST_HEAD(&tic->t_queue); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; @@ -3445,7 +3377,7 @@ xlog_ticket_alloc( tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); + init_waitqueue_head(&tic->t_wait); xlog_tic_reset_res(tic); @@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr( } STATIC void -xlog_verify_grant_head(xlog_t *log, int equals) +xlog_verify_grant_tail( + struct log *log) { - if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { - if (equals) - ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); - else - ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); - } else { - ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); - ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); - } -} /* xlog_verify_grant_head */ + int tail_cycle, tail_blocks; + int cycle, space; + + /* + * Check to make sure the grant write head didn't just overlap the + * tail. If the cycles are the same, we can't be overlapping. + * Otherwise, make sure that the cycles differ by exactly one and + * check the byte count.
+ */ + xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + ASSERT(cycle - 1 == tail_cycle); + ASSERT(space <= BBTOB(tail_blocks)); + } +} /* check if it will fit */ STATIC void @@ -3716,12 +3655,10 @@ xfs_log_force_umount( xlog_cil_force(log); /* - * We must hold both the GRANT lock and the LOG lock, - * before we mark the filesystem SHUTDOWN and wake - * everybody up to tell the bad news. + * mark the filesystem and the log as in a shutdown state and wake + * everybody up to tell them the bad news. */ spin_lock(&log->l_icloglock); - spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) XFS_BUF_DONE(mp->m_sb_bp); @@ -3742,27 +3679,21 @@ xfs_log_force_umount( spin_unlock(&log->l_icloglock); /* - * We don't want anybody waiting for log reservations - * after this. That means we have to wake up everybody - * queued up on reserve_headq as well as write_headq. - * In addition, we make sure in xlog_{re}grant_log_space - * that we don't enqueue anything once the SHUTDOWN flag - * is set, and this action is protected by the GRANTLOCK. + * We don't want anybody waiting for log reservations after this. That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks. */ - if ((tic = log->l_reserve_headq)) { - do { - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); - } - - if ((tic = log->l_write_headq)) { - do { - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_write_headq); - } - spin_unlock(&log->l_grant_lock); + spin_lock(&log->l_grant_reserve_lock); + list_for_each_entry(tic, &log->l_reserveq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_reserve_lock); + + spin_lock(&log->l_grant_write_lock); + list_for_each_entry(tic, &log->l_writeq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_write_lock); if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 23d6ceb5e97..9dc8125d04e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -61,7 +61,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); init_rwsem(&cil->xc_ctx_lock); - sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + init_waitqueue_head(&cil->xc_commit_wait); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); @@ -361,15 +361,10 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; - struct xfs_log_vec *lv; - int abortflag = abort ? XFS_LI_ABORTED : 0; struct xfs_busy_extent *busyp, *n; - /* unpin all the log items */ - for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { - xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, - abortflag); - } + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
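The hunks below convert the CIL's commit-ordering waits from sv_wait() to the new xlog_wait() helper. Condensed, the pattern on both sides of xc_commit_wait looks like the following hedged sketch, assembled from this file's restart loops rather than quoted verbatim:

	/* waiter: rescan the committing list from scratch after every sleep */
	restart:
		spin_lock(&cil->xc_cil_lock);
		list_for_each_entry(ctx, &cil->xc_committing, committing) {
			if (ctx->commit_lsn == 0) {
				/* still being pushed: queue, drop the lock, sleep */
				xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
				goto restart;
			}
		}
		spin_unlock(&cil->xc_cil_lock);

	/* waker: publish the commit lsn and wake under the same lock */
	spin_lock(&cil->xc_cil_lock);
	ctx->commit_lsn = commit_lsn;
	wake_up_all(&cil->xc_commit_wait);
	spin_unlock(&cil->xc_cil_lock);

Because xlog_wait() queues the task before dropping xc_cil_lock, a wake_up_all() issued under that lock cannot be lost between the commit_lsn test and the sleep.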
@@ -568,7 +563,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); goto restart; } } @@ -592,7 +587,7 @@ restart: */ spin_lock(&cil->xc_cil_lock); ctx->commit_lsn = commit_lsn; - sv_broadcast(&cil->xc_commit_wait); + wake_up_all(&cil->xc_commit_wait); spin_unlock(&cil->xc_cil_lock); /* release the hounds! */ @@ -757,7 +752,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); goto restart; } if (ctx->sequence != sequence) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index edcdfe01617..d5f8be8f4bf 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -21,7 +21,6 @@ struct xfs_buf; struct log; struct xlog_ticket; -struct xfs_buf_cancel; struct xfs_mount; /* @@ -54,7 +53,6 @@ struct xfs_mount; BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) - static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) { return ((xfs_lsn_t)cycle << 32) | block; @@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i) */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ -#define XLOG_TIC_IN_Q 0x4 #define XLOG_TIC_FLAGS \ { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ - { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ - { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" } + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } #endif /* __KERNEL__ */ @@ -244,9 +240,8 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - sv_t t_wait; /* ticket wait queue : 20 */ - struct xlog_ticket *t_next; /* :4|8 */ - struct xlog_ticket *t_prev; /* :4|8 */ + wait_queue_head_t t_wait; /* ticket wait queue */ + struct list_head t_queue; /* reserve/write queue */ xlog_tid_t t_tid; /* transaction identifier : 4 */ atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -353,8 +348,8 @@ typedef union xlog_in_core2 { * and move everything else out to subsequent cachelines.
*/ typedef struct xlog_in_core { - sv_t ic_force_wait; - sv_t ic_write_wait; + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; @@ -421,7 +416,7 @@ struct xfs_cil { struct xfs_cil_ctx *xc_ctx; struct rw_semaphore xc_ctx_lock; struct list_head xc_committing; - sv_t xc_commit_wait; + wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; }; @@ -491,7 +486,7 @@ typedef struct log { struct xfs_buftarg *l_targ; /* buftarg of log */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ - struct xfs_buf_cancel **l_buf_cancel_table; + struct list_head *l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ @@ -503,29 +498,40 @@ typedef struct log { int l_logBBsize; /* size of log in BB chunks */ /* The following block of fields are changed while holding icloglock */ - sv_t l_flush_wait ____cacheline_aligned_in_smp; + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ xlog_in_core_t *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ - xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed - * buffers */ - xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ - /* The following block of fields are changed while holding grant_lock */ - spinlock_t l_grant_lock ____cacheline_aligned_in_smp; - xlog_ticket_t *l_reserve_headq; - xlog_ticket_t *l_write_headq; - int l_grant_reserve_cycle; - int l_grant_reserve_bytes; - int l_grant_write_cycle; - int l_grant_write_bytes; + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + /* + * ticket grant locks, queues and accounting have their own cachelines + * as these are quite hot and can be operated on concurrently. + */ + spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; + struct list_head l_reserveq; + atomic64_t l_grant_reserve_head; + + spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; + struct list_head l_writeq; + atomic64_t l_grant_write_head; /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG @@ -534,6 +540,9 @@ typedef struct log { } xlog_t; +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ @@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector, xlog_in_core_t **commit_iclog, uint flags); /* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); } + +/* * Committed Item List interfaces */ int xlog_cil_init(struct log *log); @@ -585,6 +649,21 @@ xlog_cil_force(struct log *log) */ #define XLOG_UNMOUNT_REC_TYPE (-1U) +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} #endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 966d3f97458..aa0ebb77690 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *); #endif /* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -925,12 +936,12 @@ xlog_find_tail( log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); - log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); - log->l_grant_reserve_cycle = log->l_curr_cycle; - log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); - log->l_grant_write_cycle = log->l_curr_cycle; - log->l_grant_write_bytes = BBTOB(log->l_curr_block); + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); /* * Look for unmount record. If we find it, then we know there * log records will point recovery to after the * current unmount record. @@ -960,7 +971,7 @@ xlog_find_tail( } after_umount_blk = (i + hblks + (int) BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; - tail_lsn = log->l_tail_lsn; + tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; @@ -975,12 +986,10 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. */ - log->l_tail_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); - log->l_last_sync_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; /* @@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans( * record in the table to tell us how many times we expect to see this * record during the second pass. */ -STATIC void -xlog_recover_do_buffer_pass1( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +STATIC int +xlog_recover_buffer_pass1( + struct log *log, + xlog_recover_item_t *item) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *nextp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; - xfs_daddr_t blkno = 0; - uint len = 0; - ushort flags = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - } + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLF_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return; - } - - /* - * Insert an xfs_buf_cancel record into the hash table of - * them. If there is already an identical record, bump - * its reference count. - */ - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - /* - * If the hash bucket is empty then just insert a new record into - * the bucket. - */ - if (*bucket == NULL) { - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; - bcp->bc_refcount = 1; - bcp->bc_next = NULL; - *bucket = bcp; - return; + return 0; } /* - * The hash bucket is not empty, so search for duplicates of our - * record. If we find one them just bump its refcount. If not - * then add us at the end of the list. + * Insert an xfs_buf_cancel record into the hash table of cancelled + * buffers. If there is already an identical record, bump its reference count.
*/ - prevp = NULL; - nextp = *bucket; - while (nextp != NULL) { - if (nextp->bc_blkno == blkno && nextp->bc_len == len) { - nextp->bc_refcount++; + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return; + return 0; } - prevp = nextp; - nextp = nextp->bc_next; - } - ASSERT(prevp != NULL); - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; - bcp->bc_next = NULL; - prevp->bc_next = bcp; + list_add_tail(&bcp->bc_list, bucket); + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; } /* @@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1( */ STATIC int xlog_check_buffer_cancelled( - xlog_t *log, + struct log *log, xfs_daddr_t blkno, uint len, ushort flags) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; if (log->l_buf_cancel_table == NULL) { /* @@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled( return 0; } - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - bcp = *bucket; - if (bcp == NULL) { - /* - * There is no corresponding entry in the table built - * in pass one, so this buffer has not been cancelled. - */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; - } - /* - * Search for an entry in the buffer cancel table that - * matches our buffer. + * Search for an entry in the cancel table that matches our buffer. */ - prevp = NULL; - while (bcp != NULL) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) { - /* - * We've got a match, so return 1 so that the - * recovery of this buffer is cancelled. - * If this buffer is actually a buffer cancel - * log item, then decrement the refcount on the - * one in the table and remove it if this is the - * last reference. - */ - if (flags & XFS_BLF_CANCEL) { - bcp->bc_refcount--; - if (bcp->bc_refcount == 0) { - if (prevp == NULL) { - *bucket = bcp->bc_next; - } else { - prevp->bc_next = bcp->bc_next; - } - kmem_free(bcp); - } - } - return 1; - } - prevp = bcp; - bcp = bcp->bc_next; + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + goto found; } + /* - * We didn't find a corresponding entry in the table, so - * return 0 so that the buffer is NOT cancelled. + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; -} -STATIC int -xlog_recover_do_buffer_pass2( - xlog_t *log, - xfs_buf_log_format_t *buf_f) -{ - xfs_daddr_t blkno = 0; - ushort flags = 0; - uint len = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - flags = buf_f->blf_flags; - len = buf_f->blf_len; - break; +found: + /* + * We've got a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } } - - return xlog_check_buffer_cancelled(log, blkno, len, flags); + return 1; }
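Pass 1 and pass 2 thus cooperate through a refcounted skip-set: each cancel item seen in pass 1 adds one reference, and pass 2 consumes one reference per cancel item so that copies of the buffer logged after the last cancellation still replay. A condensed model of the lookup side (hypothetical types, kernel list primitives assumed, illustrative only):

	struct cancel_rec {
		xfs_daddr_t		blkno;
		uint			len;
		int			refs;
		struct list_head	list;
	};

	/* return true if replay of (blkno, len) should be skipped */
	static bool
	should_cancel(struct list_head *bucket, xfs_daddr_t blkno, uint len,
		      bool is_cancel_item)
	{
		struct cancel_rec *rec;

		list_for_each_entry(rec, bucket, list) {
			if (rec->blkno != blkno || rec->len != len)
				continue;
			/* a cancel item consumes one pass 1 reference */
			if (is_cancel_item && --rec->refs == 0) {
				list_del(&rec->list);
				kfree(rec);
			}
			return true;
		}
		return false;	/* not in the table: replay the buffer */
	}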
+ */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } } - - return xlog_check_buffer_cancelled(log, blkno, len, flags); + return 1; } /* - * Perform recovery for a buffer full of inodes. In these buffers, - * the only data which should be recovered is that which corresponds - * to the di_next_unlinked pointers in the on disk inode structures. - * The rest of the data for the inodes is always logged through the - * inodes themselves rather than the inode buffer and is recovered - * in xlog_recover_do_inode_trans(). + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). * - * The only time when buffers full of inodes are fully recovered is - * when the buffer is full of newly allocated inodes. In this case - * the buffer will not be marked as an inode buffer and so will be - * sent to xlog_recover_do_reg_buffer() below during recovery. + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( - xfs_mount_t *mp, + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; - int item_index; - int bit; - int nbits; - int reg_buf_offset; - int reg_buf_bytes; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - unsigned int *data_map = NULL; - unsigned int map_size = 0; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } - /* - * Set the variables corresponding to the current region to - * 0 so that we'll initialize them on the first pass through - * the loop. - */ - reg_buf_offset = 0; - reg_buf_bytes = 0; - bit = 0; - nbits = 0; - item_index = 0; inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + @@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer( * the current di_next_unlinked field. */ bit += nbits; - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. */ - if (bit == -1) { + if (bit == -1) return 0; - } - nbits = xfs_contig_bits(data_map, map_size, - bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); reg_buf_offset = bit << XFS_BLF_SHIFT; reg_buf_bytes = nbits << XFS_BLF_SHIFT; @@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer( * di_next_unlinked field, then move on to the next * di_next_unlinked field. 
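These hunks replace an open-coded singly linked hash chain with standard list_heads. Below is a compact userspace model of the resulting structure; the cancel_add()/cancel_check() helpers are hypothetical stand-ins for xlog_recover_buffer_pass1() and xlog_check_buffer_cancelled(), and the hand-rolled list replaces the kernel's <linux/list.h>:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-ins for the kernel's list_head helpers. */
struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
}

#define BC_TABLE_SIZE		16
#define BC_BUCKET(t, blkno)	(&(t)[(uint64_t)(blkno) % BC_TABLE_SIZE])

struct buf_cancel {
	struct list_head bc_list;	/* must stay first: see bc_entry() */
	uint64_t	bc_blkno;
	unsigned int	bc_len;
	int		bc_refcount;
};

/* Poor man's container_of(): bc_list is the first member. */
static struct buf_cancel *bc_entry(struct list_head *n)
{
	return (struct buf_cancel *)n;
}

/* Pass 1: remember a cancelled buffer, bumping the refcount on duplicates. */
static void cancel_add(struct list_head *table, uint64_t blkno, unsigned int len)
{
	struct list_head *bucket = BC_BUCKET(table, blkno);
	struct buf_cancel *bcp;

	for (struct list_head *n = bucket->next; n != bucket; n = n->next) {
		bcp = bc_entry(n);
		if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
			bcp->bc_refcount++;
			return;
		}
	}
	bcp = malloc(sizeof(*bcp));
	bcp->bc_blkno = blkno;
	bcp->bc_len = len;
	bcp->bc_refcount = 1;
	list_add_tail(&bcp->bc_list, bucket);
}

/*
 * Pass 2: report whether (blkno, len) was cancelled.  A query that is itself
 * a cancel record drops one reference and frees the entry on the last drop,
 * mirroring the found: path of xlog_check_buffer_cancelled().
 */
static int cancel_check(struct list_head *table, uint64_t blkno,
			unsigned int len, int is_cancel)
{
	struct list_head *bucket = BC_BUCKET(table, blkno);

	for (struct list_head *n = bucket->next; n != bucket; n = n->next) {
		struct buf_cancel *bcp = bc_entry(n);
		if (bcp->bc_blkno != blkno || bcp->bc_len != len)
			continue;
		if (is_cancel && --bcp->bc_refcount == 0) {
			list_del(&bcp->bc_list);
			free(bcp);
		}
		return 1;	/* cancelled: do not replay */
	}
	return 0;		/* not cancelled: replay the buffer */
}

int main(void)
{
	struct list_head table[BC_TABLE_SIZE];

	for (int i = 0; i < BC_TABLE_SIZE; i++)
		list_init(&table[i]);

	cancel_add(table, 100, 8);	/* pass 1 saw one cancel record */
	printf("replay blk 100? %s\n", cancel_check(table, 100, 8, 0) ? "no" : "yes");
	printf("replay blk 200? %s\n", cancel_check(table, 200, 8, 0) ? "no" : "yes");
	return 0;
}
```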
*/ - if (next_unlinked_offset < reg_buf_offset) { + if (next_unlinked_offset < reg_buf_offset) continue; - } ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); @@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer( * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. */ -/*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; - unsigned int *data_map = NULL; - unsigned int map_size = 0; int error; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); if (bit == -1) break; - nbits = xfs_contig_bits(data_map, map_size, bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); @@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer( * for more details on the implementation of the table of cancel records. */ STATIC int -xlog_recover_do_buffer_trans( +xlog_recover_buffer_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; - int cancel; - xfs_daddr_t blkno; - int len; - ushort flags; uint buf_flags; - if (pass == XLOG_RECOVER_PASS1) { - /* - * In this pass we're only looking for buf items - * with the XFS_BLF_CANCEL bit set. - */ - xlog_recover_do_buffer_pass1(log, buf_f); + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; - } else { - /* - * In this pass we want to recover all the buffers - * which have not been cancelled and are not - * cancellation buffers themselves. The routine - * we call here will tell us whether or not to - * continue with the replay of this buffer. - */ - cancel = xlog_recover_do_buffer_pass2(log, buf_f); - if (cancel) { - trace_xfs_log_recover_buf_cancel(log, buf_f); - return 0; - } } + trace_xfs_log_recover_buf_recover(log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - default: - xfs_fs_cmn_err(CE_ALERT, log->l_mp, - "xfs_log_recover: unknown buffer type 0x%x, logdev %s", - buf_f->blf_type, log->l_mp->m_logname ? 
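Both recovery paths above walk blf_data_map with xfs_next_bit()/xfs_contig_bits() to find each logged region of the buffer. A self-contained sketch of that iteration pattern, with simplified stand-ins for the two bit helpers and assuming the 128-byte chunk size behind XFS_BLF_SHIFT:

```c
#include <stdio.h>

#define BLF_SHIFT 7	/* 128-byte logging chunks, as with XFS_BLF_CHUNK */

/* Simplified stand-in for xfs_next_bit(): first set bit at or after
 * 'start', or -1 when the map is exhausted. */
static int next_bit(const unsigned int *map, int size_bits, int start)
{
	for (int i = start; i < size_bits; i++)
		if (map[i / 32] & (1u << (i % 32)))
			return i;
	return -1;
}

/* Simplified stand-in for xfs_contig_bits(): length of the run of set
 * bits starting at 'start'. */
static int contig_bits(const unsigned int *map, int size_bits, int start)
{
	int n = 0;

	while (start + n < size_bits &&
	       (map[(start + n) / 32] & (1u << ((start + n) % 32))))
		n++;
	return n;
}

int main(void)
{
	/* two logged regions: chunks 1-3 and chunks 8-9 */
	unsigned int data_map[1] = { 0x30E };
	int bit = 0, nbits = 0;

	while ((bit = next_bit(data_map, 32, bit + nbits)) != -1) {
		nbits = contig_bits(data_map, 32, bit);
		printf("region at byte %d, %d bytes long\n",
		       bit << BLF_SHIFT, nbits << BLF_SHIFT);
	}
	return 0;
}
```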
- log->l_mp->m_logname : "internal"); - XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", - XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); - } - mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(flags & XFS_BLF_INODE_BUF)) + if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags); if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, - bp, blkno); + xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, + bp, buf_f->blf_blkno); error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } error = 0; - if (flags & XFS_BLF_INODE_BUF) { + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (flags & + } else if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { @@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans( } STATIC int -xlog_recover_do_inode_trans( +xlog_recover_inode_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_inode_log_format_t *in_f; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; - xfs_ino_t ino; int len; xfs_caddr_t src; xfs_caddr_t dest; @@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans( xfs_icdinode_t *dicp; int need_free = 0; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { in_f = item->ri_buf[0].i_addr; } else { @@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans( if (error) goto error; } - ino = in_f->ilf_ino; - mp = log->l_mp; /* * Inode buffers can be freed, look out for it, @@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", - dip, bp, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", + dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", - item, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", + item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans( if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } @@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans( if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, 
"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", - item, dip, bp, ino, + item, dip, bp, in_f->ilf_ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; goto error; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", - item, dip, bp, ino, dicp->di_forkoff); + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; goto error; } if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, @@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans( break; default: - xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); + xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); ASSERT(0); xfs_buf_relse(bp); error = EIO; @@ -2556,18 +2422,11 @@ error: * of that type. */ STATIC int -xlog_recover_do_quotaoff_trans( +xlog_recover_quotaoff_pass1( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { - xfs_qoff_logformat_t *qoff_f; - - if (pass == XLOG_RECOVER_PASS2) { - return (0); - } - - qoff_f = item->ri_buf[0].i_addr; + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans( * Recover a dquot record */ STATIC int -xlog_recover_do_dquot_trans( +xlog_recover_dquot_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - mp = log->l_mp; /* * Filesystems are required to send in quota flags at mount time. @@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans( if ((error = xfs_qm_dqcheck(recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans (log copy)"))) { + "xlog_recover_dquot_pass2 (log copy)"))) { return XFS_ERROR(EIO); } ASSERT(dq_f->qlf_len == 1); @@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans( * minimal initialization then. */ if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans")) { + "xlog_recover_dquot_pass2")) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } @@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans( * LSN. 
*/ STATIC int -xlog_recover_do_efi_trans( +xlog_recover_efi_pass2( xlog_t *log, xlog_recover_item_t *item, - xfs_lsn_t lsn, - int pass) + xfs_lsn_t lsn) { int error; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - efi_formatp = item->ri_buf[0].i_addr; - mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } - efip->efi_next_extent = efi_formatp->efi_nextents; - efip->efi_flags |= XFS_EFI_COMMITTED; + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); spin_lock(&log->l_ailp->xa_lock); /* * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); return 0; } @@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans( * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ -STATIC void -xlog_recover_do_efd_trans( +STATIC int +xlog_recover_efd_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans( struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; - if (pass == XLOG_RECOVER_PASS1) { - return; - } - efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || @@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans( } xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); -} - -/* - * Perform the transaction - * - * If the transaction modifies a buffer or inode, do it now. Otherwise, - * EFIs and EFDs get queued up by adding entries into the AIL for them. 
- */ -STATIC int -xlog_recover_do_trans( - xlog_t *log, - xlog_recover_t *trans, - int pass) -{ - int error = 0; - xlog_recover_item_t *item; - - error = xlog_recover_reorder_trans(log, trans, pass); - if (error) - return error; - - list_for_each_entry(item, &trans->r_itemq, ri_list) { - trace_xfs_log_recover_item_recover(log, trans, item, pass); - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - error = xlog_recover_do_buffer_trans(log, item, pass); - break; - case XFS_LI_INODE: - error = xlog_recover_do_inode_trans(log, item, pass); - break; - case XFS_LI_EFI: - error = xlog_recover_do_efi_trans(log, item, - trans->r_lsn, pass); - break; - case XFS_LI_EFD: - xlog_recover_do_efd_trans(log, item, pass); - error = 0; - break; - case XFS_LI_DQUOT: - error = xlog_recover_do_dquot_trans(log, item, pass); - break; - case XFS_LI_QUOTAOFF: - error = xlog_recover_do_quotaoff_trans(log, item, - pass); - break; - default: - xlog_warn( - "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); - ASSERT(0); - error = XFS_ERROR(EIO); - break; - } - - if (error) - return error; - } return 0; } @@ -2852,7 +2638,7 @@ xlog_recover_do_trans( */ STATIC void xlog_recover_free_trans( - xlog_recover_t *trans) + struct xlog_recover *trans) { xlog_recover_item_t *item, *n; int i; @@ -2871,17 +2657,95 @@ xlog_recover_free_trans( } STATIC int +xlog_recover_commit_pass1( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + /* nothing to do in pass 1 */ + return 0; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_commit_pass1", + ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_commit_pass2( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, item); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, item); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_commit_pass2", + ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. 
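The deleted xlog_recover_do_trans() threaded a pass argument into every item handler, each of which then had to check it and possibly do nothing. The replacement dispatchers in this same hunk give each pass its own switch instead. A toy model of that restructuring, with hypothetical item types and handlers:

```c
#include <stdio.h>

enum item_type { LI_BUF, LI_INODE, LI_QUOTAOFF };

struct item { enum item_type type; const char *name; };

/* Pass 1 only builds state (cancel records, quota-off markers). */
static int commit_pass1(const struct item *it)
{
	switch (it->type) {
	case LI_BUF:
		printf("pass1: scan %s for cancel records\n", it->name);
		return 0;
	case LI_QUOTAOFF:
		printf("pass1: note quota-off %s\n", it->name);
		return 0;
	default:
		return 0;	/* nothing to do in pass 1 */
	}
}

/* Pass 2 replays whatever pass 1 did not rule out. */
static int commit_pass2(const struct item *it)
{
	switch (it->type) {
	case LI_BUF:
		printf("pass2: replay buffer %s\n", it->name);
		return 0;
	case LI_INODE:
		printf("pass2: replay inode %s\n", it->name);
		return 0;
	default:
		return 0;	/* nothing to do in pass 2 */
	}
}

int main(void)
{
	struct item items[] = {
		{ LI_BUF, "buf#1" }, { LI_INODE, "ino#42" }, { LI_QUOTAOFF, "qoff" },
	};
	int n = sizeof(items) / sizeof(items[0]);

	for (int pass = 1; pass <= 2; pass++)
		for (int i = 0; i < n; i++)
			(pass == 1 ? commit_pass1 : commit_pass2)(&items[i]);
	return 0;
}
```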
+ */ +STATIC int xlog_recover_commit_trans( - xlog_t *log, - xlog_recover_t *trans, + struct log *log, + struct xlog_recover *trans, int pass) { - int error; + int error = 0; + xlog_recover_item_t *item; hlist_del(&trans->r_list); - if ((error = xlog_recover_do_trans(log, trans, pass))) + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) return error; - xlog_recover_free_trans(trans); /* no error */ + + list_for_each_entry(item, &trans->r_itemq, ri_list) { + if (pass == XLOG_RECOVER_PASS1) + error = xlog_recover_commit_pass1(log, trans, item); + else + error = xlog_recover_commit_pass2(log, trans, item); + if (error) + return error; + } + + xlog_recover_free_trans(trans); return 0; } @@ -3011,7 +2875,7 @@ xlog_recover_process_efi( xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); /* * First check the validity of the extents described by the @@ -3050,7 +2914,7 @@ xlog_recover_process_efi( extp->ext_len); } - efip->efi_flags |= XFS_EFI_RECOVERED; + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp, 0); return error; @@ -3107,7 +2971,7 @@ xlog_recover_process_efis( * Skip EFIs that we've already processed. */ efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_flags & XFS_EFI_RECOVERED) { + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } @@ -3724,7 +3588,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error; + int error, i; ASSERT(head_blk != tail_blk); @@ -3732,10 +3596,12 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = - (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(xfs_buf_cancel_t*), + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { @@ -3754,7 +3620,7 @@ xlog_do_log_recovery( int i; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(log->l_buf_cancel_table[i] == NULL); + ASSERT(list_empty(&log->l_buf_cancel_table[i])); } #endif /* DEBUG */ @@ -3934,7 +3800,7 @@ xlog_recover_finish( log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", + "Ending clean XFS mount for filesystem: %s\n", log->l_mp->m_fsname); } return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 19e9dfa1c25..d447aef84bc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -472,7 +472,7 @@ xfs_initialize_perag( goto out_unwind; pag->pag_agno = index; pag->pag_mount = mp; - rwlock_init(&pag->pag_ici_lock); + spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); spin_lock_init(&pag->pag_buf_lock); @@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp) } /* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); + } +} + + +/* * Set whether we're using inode alignment. 
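A smaller conversion also lands in the hunks above: EFI recovery state moves from an open-coded efi_flags |= XFS_EFI_RECOVERED to set_bit()/test_bit(), which are atomic read-modify-write operations. A userspace sketch of why that matters, using C11 atomic_fetch_or in place of set_bit() (the second, EFI_COMMITTED-style bit is purely illustrative):

```c
#include <stdatomic.h>
#include <stdio.h>

enum { EFI_RECOVERED, EFI_OTHER };	/* bit numbers, as with set_bit() */

static _Atomic unsigned long efi_flags;

/* Atomic analogues of the kernel's set_bit()/test_bit(). */
static void flag_set(int bit)  { atomic_fetch_or(&efi_flags, 1UL << bit); }
static int  flag_test(int bit) { return (atomic_load(&efi_flags) >> bit) & 1; }

int main(void)
{
	/*
	 * With a plain "flags |= ..." two threads doing read-modify-write
	 * on different bits can lose one of the updates; fetch_or cannot.
	 */
	flag_set(EFI_OTHER);
	flag_set(EFI_RECOVERED);
	printf("recovered=%d other=%d\n",
	       flag_test(EFI_RECOVERED), flag_test(EFI_OTHER));
	return 0;
}
```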
*/ STATIC void @@ -1196,6 +1214,9 @@ xfs_mountfs( */ xfs_set_rw_sizes(mp); + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + /* * Set the inode cluster size. * This may still be overridden by the file system diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 5861b498074..a62e8971539 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, xfs_mod_incore_sb(mp, field, delta, rsvd) #endif +/* dynamic preallocation free space thresholds, 5% down to 1% */ +enum { + XFS_LOWSP_1_PCNT = 0, + XFS_LOWSP_2_PCNT, + XFS_LOWSP_3_PCNT, + XFS_LOWSP_4_PCNT, + XFS_LOWSP_5_PCNT, + XFS_LOWSP_MAX, +}; + typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -202,6 +212,8 @@ typedef struct xfs_mount { __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ + int64_t m_low_space[XFS_LOWSP_MAX]; + /* low free space thresholds */ } xfs_mount_t; /* @@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); +extern void xfs_set_low_space_thresholds(struct xfs_mount *); + #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f6d956b7711..33dbc4e0ad6 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1137,7 +1137,7 @@ out_undo_fdblocks: if (blkdelta) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); out: - ASSERT(error = 0); + ASSERT(error == 0); return; } @@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs( * they could be immediately flushed and we'd have to race with the flusher * trying to pull the item from the AIL as we add it. */ -void +static void xfs_trans_item_committed( struct xfs_log_item *lip, xfs_lsn_t commit_lsn, @@ -1425,6 +1425,83 @@ xfs_trans_committed( xfs_trans_free(tp); } +static inline void +xfs_log_item_batch_insert( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) + IOP_UNPIN(log_items[i], 0); +} + +/* + * Bulk operation version of xfs_trans_committed that takes a log vector of + * items to insert into the AIL. This uses bulk AIL insertion techniques to + * minimise lock traffic. + */ +void +xfs_trans_committed_bulk( + struct xfs_ail *ailp, + struct xfs_log_vec *log_vector, + xfs_lsn_t commit_lsn, + int aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + int i = 0; + + /* unpin all the log items */ + for (lv = log_vector; lv; lv = lv->lv_next ) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* item_lsn of -1 means the item was freed */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. 
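xfs_set_low_space_thresholds() above precomputes five thresholds, one per XFS_LOWSP_* slot, at 1%–5% of the filesystem's data blocks; do_div() is used because 64-bit division is expensive on 32-bit kernels. A userspace mirror of the calculation, with plain division standing in for do_div():

```c
#include <stdint.h>
#include <stdio.h>

enum { LOWSP_1_PCNT, LOWSP_2_PCNT, LOWSP_3_PCNT,
       LOWSP_4_PCNT, LOWSP_5_PCNT, LOWSP_MAX };

/* Divide first, then scale, exactly as the loop in the hunk does: each
 * slot i holds (i + 1) percent of the data block count. */
static void set_low_space_thresholds(uint64_t dblocks, int64_t out[LOWSP_MAX])
{
	for (int i = 0; i < LOWSP_MAX; i++) {
		uint64_t space = dblocks / 100;	/* stands in for do_div() */

		out[i] = (int64_t)(space * (i + 1));
	}
}

int main(void)
{
	int64_t low_space[LOWSP_MAX];

	set_low_space_thresholds(1000000, low_space);	/* 1M-block fs */
	for (int i = 0; i < LOWSP_MAX; i++)
		printf("threshold %d%%: %lld blocks\n", i + 1,
		       (long long)low_space[i]);
	return 0;
}
```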
+ */ + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->xa_lock); + IOP_UNPIN(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xfs_log_item_batch_insert(ailp, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); +} + /* * Called from the trans_commit code when we notice that * the filesystem is in the middle of a forced shutdown. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 246286b77a8..c2042b736b8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -294,8 +294,8 @@ struct xfs_log_item_desc { #define XFS_ALLOC_BTREE_REF 2 #define XFS_BMAP_BTREE_REF 2 #define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 -#define XFS_INO_REF 1 #define XFS_DQUOT_REF 1 #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index dc9069568ff..c5bbbc45db9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -28,8 +28,8 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); +STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); +STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); @@ -449,129 +449,152 @@ xfs_trans_unlocked_item( xfs_log_move_tail(ailp->xa_mount, 1); } /* xfs_trans_unlocked_item */ - /* - * Update the position of the item in the AIL with the new - * lsn. If it is not yet in the AIL, add it. Otherwise, move - * it to its new position by removing it and re-adding it. + * xfs_trans_ail_update - bulk AIL insertion operation. + * + * @xfs_trans_ail_update takes an array of log items that all need to be + * positioned at the same LSN in the AIL. If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. * - * Wakeup anyone with an lsn less than the item's lsn. If the item - * we move in the AIL is the minimum one, update the tail lsn in the - * log manager. + * This function takes the AIL lock once to execute the update operations on + * all the items in the array, and as such should not be called with the AIL + * lock held. As a result, once we have the AIL lock, we need to check each log + * item LSN to confirm it needs to be moved forward in the AIL. * - * This function must be called with the AIL lock held. The lock - * is dropped before returning. + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. 
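xfs_trans_committed_bulk() above amortises AIL lock traffic by accumulating up to LOG_ITEM_BATCH_SIZE items and inserting them in one go. A toy model of that accumulate-then-flush loop, counting simulated lock round-trips; batch_insert() here is a stand-in for xfs_log_item_batch_insert():

```c
#include <stdio.h>

#define BATCH_SIZE 32

static int lock_rounds;	/* times the simulated AIL lock was taken */

/* Stand-in for xfs_log_item_batch_insert(): one lock round-trip per batch. */
static void batch_insert(const int *batch, int n)
{
	lock_rounds++;			/* spin_lock(&ailp->xa_lock) */
	for (int i = 0; i < n; i++)
		(void)batch[i];		/* position item in AIL, IOP_UNPIN() */
}

/* The shape of the batching loop in xfs_trans_committed_bulk(). */
static void commit_items(int nitems)
{
	int pending[BATCH_SIZE];
	int n = 0;

	for (int item = 0; item < nitems; item++) {
		pending[n++] = item;
		if (n == BATCH_SIZE) {
			batch_insert(pending, n);
			n = 0;
		}
	}
	if (n)				/* make sure we insert the remainder! */
		batch_insert(pending, n);
}

int main(void)
{
	commit_items(100);
	printf("100 items -> %d lock round-trips instead of 100\n",
	       lock_rounds);
	return 0;
}
```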
*/ void -xfs_trans_ail_update( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - xfs_lsn_t lsn) __releases(ailp->xa_lock) +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_log_item_t *dlip = NULL; - xfs_log_item_t *mlip; /* ptr to minimum lip */ + xfs_log_item_t *mlip; xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); mlip = xfs_ail_min(ailp); - if (lip->li_flags & XFS_LI_IN_AIL) { - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); - xfs_trans_ail_cursor_clear(ailp, dlip); - } else { - lip->li_flags |= XFS_LI_IN_AIL; + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); } - lip->li_lsn = lsn; - xfs_ail_insert(ailp, lip); + xfs_ail_splice(ailp, &tmp, lsn); - if (mlip == dlip) { - mlip = xfs_ail_min(ailp); - /* - * It is not safe to access mlip after the AIL lock is - * dropped, so we must get a copy of li_lsn before we do - * so. This is especially important on 32-bit platforms - * where accessing and updating 64-bit values like li_lsn - * is not atomic. - */ - tail_lsn = mlip->li_lsn; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); - } else { + if (!mlip_changed) { spin_unlock(&ailp->xa_lock); + return; } - -} /* xfs_trans_update_ail */ + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip->li_lsn; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} /* - * Delete the given item from the AIL. It must already be in - * the AIL. + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL * - * Wakeup anyone with an lsn less than item's lsn. If the item - * we delete in the AIL is the minimum one, update the tail lsn in the - * log manager. + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * be removed from the AIL. The caller is already holding the AIL lock, and + * has done all the checks necessary to ensure the items passed in via @log_items are + * ready for deletion. This includes checking that the items are in the AIL. * - * Clear the IN_AIL flag from the item, reset its lsn to 0, and - * bump the AIL's generation count to indicate that the tree - * has changed. + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. * - * This function must be called with the AIL lock held. The lock - * is dropped before returning. + * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning.
*/ void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip) __releases(ailp->xa_lock) +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items) __releases(ailp->xa_lock) { - xfs_log_item_t *dlip; xfs_log_item_t *mlip; xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; - if (lip->li_flags & XFS_LI_IN_AIL) { - mlip = xfs_ail_min(ailp); - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); - xfs_trans_ail_cursor_clear(ailp, dlip); - + mlip = xfs_ail_min(ailp); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; - if (mlip == dlip) { - mlip = xfs_ail_min(ailp); - /* - * It is not safe to access mlip after the AIL lock - * is dropped, so we must get a copy of li_lsn - * before we do so. This is especially important - * on 32-bit platforms where accessing and updating - * 64-bit values like li_lsn is not atomic. - */ - tail_lsn = mlip ? mlip->li_lsn : 0; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); - } else { spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + return; } + + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; } - else { - /* - * If the file system is not being shutdown, we are in - * serious trouble if we get to this stage. - */ - struct xfs_mount *mp = ailp->xa_mount; + if (!mlip_changed) { spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + return; } -} - + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. It is possible we've emptied the + * AIL here, so if that is the case, pass an LSN of 0 to the tail move. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip ? mlip->li_lsn : 0; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} /* * The active item list (AIL) is a doubly linked list of log @@ -623,16 +646,13 @@ xfs_trans_ail_destroy( } /* - * Insert the given log item into the AIL. - * We almost always insert at the end of the list, so on inserts - * we search from the end of the list to find where the - * new item belongs. + * splice the log item list into the AIL at the given LSN. */ STATIC void -xfs_ail_insert( +xfs_ail_splice( struct xfs_ail *ailp, - xfs_log_item_t *lip) -/* ARGSUSED */ + struct list_head *list, + xfs_lsn_t lsn) { xfs_log_item_t *next_lip; @@ -640,39 +660,33 @@ xfs_ail_insert( * If the list is empty, just insert the item. 
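The bulk update above leans on xfs_ail_splice() (shown in the next hunk): all moved items are first collected on a temporary list, then the sorted AIL is searched once and the whole chain is linked in at that point, instead of one search per item. A singly-linked userspace simplification of that one-search-per-batch splice:

```c
#include <stdio.h>

struct node { int lsn; struct node *next; };

/*
 * Sketch of the delete-then-splice idea in xfs_trans_ail_update_bulk():
 * 'chain' holds every item being moved to 'lsn', and the sorted list is
 * walked exactly once to find where the whole chain belongs.
 */
static void splice_at_lsn(struct node **ail, struct node *chain, int lsn)
{
	struct node **pos = ail;
	struct node *tail = chain;

	while (*pos && (*pos)->lsn <= lsn)	/* one search for the batch */
		pos = &(*pos)->next;
	while (tail->next)			/* find the end of the chain */
		tail = tail->next;
	tail->next = *pos;
	*pos = chain;
}

int main(void)
{
	struct node tail_item = { 30, NULL };
	struct node head_item = { 10, &tail_item };
	struct node *ail = &head_item;

	/* two items being moved to LSN 20, already chained together */
	struct node m1 = { 20, NULL };
	struct node m2 = { 20, &m1 };

	splice_at_lsn(&ail, &m2, 20);
	for (struct node *n = ail; n; n = n->next)
		printf("%d ", n->lsn);		/* prints: 10 20 20 30 */
	printf("\n");
	return 0;
}
```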
*/ if (list_empty(&ailp->xa_ail)) { - list_add(&lip->li_ail, &ailp->xa_ail); + list_splice(list, &ailp->xa_ail); return; } list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) + if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) break; } ASSERT((&next_lip->li_ail == &ailp->xa_ail) || - (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); - - list_add(&lip->li_ail, &next_lip->li_ail); + (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)); - xfs_ail_check(ailp, lip); + list_splice_init(list, &next_lip->li_ail); return; } /* * Delete the given item from the AIL. Return a pointer to the item. */ -/*ARGSUSED*/ -STATIC xfs_log_item_t * +STATIC void xfs_ail_delete( struct xfs_ail *ailp, xfs_log_item_t *lip) -/* ARGSUSED */ { xfs_ail_check(ailp, lip); - list_del(&lip->li_ail); - - return lip; + xfs_trans_ail_cursor_clear(ailp, lip); } /* @@ -682,7 +696,6 @@ xfs_ail_delete( STATIC xfs_log_item_t * xfs_ail_min( struct xfs_ail *ailp) -/* ARGSUSED */ { if (list_empty(&ailp->xa_ail)) return NULL; @@ -699,7 +712,6 @@ STATIC xfs_log_item_t * xfs_ail_next( struct xfs_ail *ailp, xfs_log_item_t *lip) -/* ARGSUSED */ { if (lip->li_ail.next == &ailp->xa_ail) return NULL; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index f783d5e9fa7..f7590f5bade 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; - next_extent = efip->efi_next_extent; + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &(efip->efi_format.efi_extents[next_extent]); extp->ext_start = start_block; extp->ext_len = ext_len; - efip->efi_next_extent++; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 62da86c90de..35162c238fa 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -22,15 +22,17 @@ struct xfs_log_item; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, int flags); -void xfs_trans_item_committed(struct xfs_log_item *lip, - xfs_lsn_t commit_lsn, int aborted); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); /* * AIL traversal cursor. 
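The xfs_trans_extfree.c hunk above swaps a plain efi_next_extent++ for atomic_inc_return(...) - 1, so concurrent callers each claim a distinct slot in the extent array. The same pattern in C11, where atomic_fetch_add() conveniently returns the pre-increment value and needs no subtraction:

```c
#include <stdatomic.h>
#include <stdio.h>

#define NEXTENTS 8

static _Atomic int next_extent;		/* like efip->efi_next_extent */
static int extents[NEXTENTS];

/*
 * Reserve one slot: fetch_add returns the value *before* the increment,
 * which is exactly the index this caller owns.  (atomic_inc_return()
 * returns the value *after* it, hence the "- 1" in the kernel hunk.)
 */
static int claim_slot(void)
{
	return atomic_fetch_add(&next_extent, 1);
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		int idx = claim_slot();

		extents[idx] = 100 + i;
		printf("slot %d <- %d\n", idx, extents[idx]);
	}
	return 0;
}
```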
* @@ -73,12 +75,29 @@ struct xfs_ail { /* * From xfs_trans_ail.c */ -void xfs_trans_ail_update(struct xfs_ail *ailp, - struct xfs_log_item *lip, xfs_lsn_t lsn) - __releases(ailp->xa_lock); -void xfs_trans_ail_delete(struct xfs_ail *ailp, - struct xfs_log_item *lip) - __releases(ailp->xa_lock); +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); +} + +void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items) + __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) __releases(ailp->xa_lock) +{ + xfs_trans_ail_delete_bulk(ailp, &lip, 1); +} + void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 8e4a63c4151..d8e6f8cd6f0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -964,29 +964,48 @@ xfs_release( xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); } - if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (ip->i_d.di_nlink == 0) + return 0; - /* - * If we can't get the iolock just skip truncating - * the blocks past EOF because we could deadlock - * with the mmap_sem otherwise. We'll get another - * chance to drop them once the last reference to - * the inode is dropped, so we'll never leak blocks - * permanently. - */ - error = xfs_free_eofblocks(mp, ip, - XFS_FREE_EOF_TRYLOCK); - if (error) - return error; - } - } + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } return 0; }
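Finally, the xfs_release() rewrite makes EOF-block trimming a one-shot affair: the first dirty close trims speculative preallocation, and if delalloc blocks remain afterwards the inode is flagged so later closes leave the preallocation alone. A toy model of that heuristic (the split into two delalloc counters is illustrative; the kernel only sees the combined i_delayed_blks):

```c
#include <stdbool.h>
#include <stdio.h>

struct inode_model {
	int  delay_in_eof;	/* delalloc blocks inside EOF: real dirty data */
	int  delay_past_eof;	/* speculative preallocation beyond EOF */
	bool dirty_release;	/* models the XFS_IDIRTY_RELEASE iflag */
};

static void release(struct inode_model *ip)
{
	if (ip->dirty_release) {
		puts("close: leave preallocation alone (dirty-release set)");
		return;
	}
	ip->delay_past_eof = 0;		/* xfs_free_eofblocks() stand-in */
	puts("close: trimmed speculative blocks past EOF");

	/* delalloc left after the trim means the file really is dirty */
	if (ip->delay_in_eof)
		ip->dirty_release = true;
}

int main(void)
{
	struct inode_model ip = { .delay_in_eof = 4, .delay_past_eof = 16 };

	release(&ip);			/* first close: trim, then flag */
	ip.delay_past_eof = 16;		/* app writes again, more prealloc */
	release(&ip);			/* second close: prealloc preserved */
	return 0;
}
```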