125 files changed, 6166 insertions, 4817 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef966188611..2b78014a124 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	options = tmp_options;
 
 	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
+		int token, r;
 		if (!*p)
 			continue;
 		token = match_token(p, tokens, args);
-		if (token < Opt_uname) {
-			int r = match_int(&args[0], &option);
+		switch (token) {
+		case Opt_debug:
+			r = match_int(&args[0], &option);
 			if (r < 0) {
 				P9_DPRINTK(P9_DEBUG_ERROR,
-					"integer field, but no integer?\n");
+					   "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
-		}
-		switch (token) {
-		case Opt_debug:
 			v9ses->debug = option;
 #ifdef CONFIG_NET_9P_DEBUG
 			p9_debug_level = option;
@@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			break;
 
 		case Opt_dfltuid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->dfltuid = option;
 			break;
 		case Opt_dfltgid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->dfltgid = option;
 			break;
 		case Opt_afid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->afid = option;
 			break;
 		case Opt_uname:
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 9c2bdda5cd9..598fff1a54e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		while (rdir->head < rdir->tail) {
 			p9stat_init(&st);
-			err = p9stat_read(rdir->buf + rdir->head,
-						rdir->tail - rdir->head, &st,
-						fid->clnt->proto_version);
+			err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
+					  rdir->tail - rdir->head, &st);
 			if (err) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
@@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-								filp->f_pos);
+						filp->f_pos);
 			if (err <= 0)
 				goto unlock_and_exit;
 
@@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 
 		while (rdir->head < rdir->tail) {
 
-			err = p9dirent_read(rdir->buf + rdir->head,
-						rdir->tail - rdir->head,
-						&curdirent,
-						fid->clnt->proto_version);
+			err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
+					    rdir->tail - rdir->head,
+					    &curdirent);
 			if (err < 0) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e3c03db3c78..b5a1076aaa6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 	case S_IFSOCK:
 		if (v9fs_proto_dotl(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations_dotl;
-			inode->i_fop = &v9fs_file_operations_dotl;
 		} else if (v9fs_proto_dotu(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
 		} else {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
diff --git a/fs/Makefile b/fs/Makefile
index afc109691a9..d2c3353d547 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
-obj-$(CONFIG_EXOFS_FS)          += exofs/
+obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
diff --git a/fs/attr.c b/fs/attr.c
index 538e27959d3..7ee7ba48831 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -13,6 +13,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
 #include <linux/security.h>
+#include <linux/evm.h>
 
 /**
  * inode_change_ok - check if attribute changes to an inode are allowed
@@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	else
 		error = simple_setattr(dentry, attr);
 
-	if (!error)
+	if (!error) {
 		fsnotify_change(dentry, ia_valid);
+		evm_inode_post_setattr(dentry, ia_valid);
+	}
 
 	return error;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 538f65a79ec..dae5dfe41ba 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1047,7 +1047,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	if (!max_to_defrag)
 		max_to_defrag = last_index - 1;
 
-	while (i <= last_index && defrag_count < max_to_defrag) {
+	/*
+	 * make writeback starts from i, so the defrag range can be
+	 * written sequentially.
+	 */
+	if (i < inode->i_mapping->writeback_index)
+		inode->i_mapping->writeback_index = i;
+
+	while (i <= last_index && defrag_count < max_to_defrag &&
+	       (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+		PAGE_CACHE_SHIFT)) {
 		/*
 		 * make sure we stop running if someone unmounts
 		 * the FS
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 69565e5fc6a..426aa464f1a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -383,36 +383,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 				XATTR_REPLACE);
 }
 
-int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
-			      struct inode *inode, struct inode *dir,
-			      const struct qstr *qstr)
+int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		     void *fs_info)
 {
-	int err;
-	size_t len;
-	void *value;
-	char *suffix;
+	const struct xattr *xattr;
+	struct btrfs_trans_handle *trans = fs_info;
 	char *name;
+	int err = 0;
 
-	err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
-					   &len);
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
-	}
-
-	name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
-		       GFP_NOFS);
-	if (!name) {
-		err = -ENOMEM;
-	} else {
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+			       strlen(xattr->name) + 1, GFP_NOFS);
+		if (!name) {
+			err = -ENOMEM;
+			break;
+		}
 		strcpy(name, XATTR_SECURITY_PREFIX);
-		strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-		err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+		err = __btrfs_setxattr(trans, inode, name,
+				       xattr->value, xattr->value_len, 0);
 		kfree(name);
+		if (err < 0)
+			break;
 	}
-
-	kfree(suffix);
-	kfree(value);
 	return err;
 }
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+			      struct inode *inode, struct inode *dir,
+			      const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &btrfs_initxattrs, trans);
+}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f70d87d6ba6..d545a95c30e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3018,9 +3018,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
 {
 	kfree(volume_info->username);
 	kzfree(volume_info->password);
-	kfree(volume_info->UNC);
 	if (volume_info->UNCip != volume_info->UNC + 2)
 		kfree(volume_info->UNCip);
+	kfree(volume_info->UNC);
 	kfree(volume_info->domainname);
 	kfree(volume_info->iocharset);
 	kfree(volume_info->prepath);
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index fde15b1a1eb..45f07c46f3e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -31,16 +32,8 @@
 #define MAX_EA_VALUE_SIZE 65535
 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
-#define CIFS_XATTR_USER_PREFIX "user."
-#define CIFS_XATTR_SYSTEM_PREFIX "system."
-#define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX "security."
-#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
-#define XATTR_TRUSTED_PREFIX_LEN  8
-#define XATTR_SECURITY_PREFIX_LEN 9
-/* BB need to add server (Samba e.g) support for security and trusted prefix */
-
 
+/* BB need to add server (Samba e.g) support for security and trusted prefix */
 
 int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 {
@@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 	}
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
-	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
-		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		&& (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
 		cFYI(1,
 		     "illegal xattr request %s (only user namespace supported)",
 		     ea_name);
@@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto remove_ea_exit;
 
-		ea_name += 5; /* skip past user. prefix */
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL,
 			(__u16)0, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
-	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
 			cFYI(1, "attempt to set cifs inode metadata");
 
-		ea_name += 5; /* skip past user. prefix */
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
 			(__u16)value_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	} else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
+		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 
-		ea_name += 4; /* skip past os2. prefix */
+		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
 			(__u16)value_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
-	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
@@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
-		ea_name += 5; /* skip past user. prefix */
+		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
 		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	} else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
-		ea_name += 4; /* skip past os2. prefix */
+		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
 		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		cFYI(1, "Query CIFS ACL not supported yet");
 #endif /* CONFIG_CIFS_ACL */
 	} else if (strncmp(ea_name,
-		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
+		  XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
 		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
-		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
+		  XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
 		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 44e17e9c21a..cc0ea9fe5ec 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -59,12 +59,11 @@ void coda_sysctl_clean(void);
 
 #define CODA_ALLOC(ptr, cast, size) do { \
     if (size < PAGE_SIZE) \
-        ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
+        ptr = kzalloc((unsigned long) size, GFP_KERNEL); \
     else \
-        ptr = (cast)vmalloc((unsigned long) size); \
+        ptr = (cast)vzalloc((unsigned long) size); \
     if (!ptr) \
         printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
-    else memset( ptr, 0, size ); \
 } while (0)
 
 
diff --git a/fs/compat.c b/fs/compat.c
index 58b1da45989..05e3f3d2cd7 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -37,7 +37,6 @@
 #include <linux/dirent.h>
 #include <linux/fsnotify.h>
 #include <linux/highuid.h>
-#include <linux/nfsd/syscall.h>
 #include <linux/personality.h>
 #include <linux/rwsem.h>
 #include <linux/tsacct_kern.h>
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c83f4768eea..ca418aaf635 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -23,7 +23,8 @@
  *
  * configfs Copyright (C) 2005 Oracle.  All rights reserved.
  *
- * Please see Documentation/filesystems/configfs.txt for more information.
+ * Please see Documentation/filesystems/configfs/configfs.txt for more
+ * information.
  */
 
 #undef DEBUG
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 76dc4c3e5d5..50cee7f9110 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -23,7 +23,7 @@
  *
  * configfs Copyright (C) 2005 Oracle.  All rights reserved.
  *
- * Please see the file Documentation/filesystems/configfs.txt for
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
  * critical information about using the config_item interface.
  */
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e7a7a2f0732..f3a257d7a98 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -1,5 +1,5 @@
 /*
- *  file.c - part of debugfs, a tiny little debug file system
+ *  inode.c - part of debugfs, a tiny little debug file system
  *
  *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
  *  Copyright (C) 2004 IBM Inc.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index fe047d966dc..9026fc91fe3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -700,7 +700,7 @@ static const struct file_operations eventpoll_fops = {
 	.llseek		= noop_llseek,
 };
 
-/* Fast test to see if the file is an evenpoll file */
+/* Fast test to see if the file is an eventpoll file */
 static inline int is_file_epoll(struct file *f)
 {
 	return f->f_op == &eventpoll_fops;
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c4..352ba149d23 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
 #
 
 # ore module library
-obj-$(CONFIG_ORE) += ore.o
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
 
 exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae414929..fa9a286c877 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
 config ORE
 	tristate
+	depends on EXOFS_FS
+	select ASYNC_XOR
+	default SCSI_OSD_ULD
 
 config EXOFS_FS
 	tristate "exofs: OSD based file system support"
 	depends on SCSI_OSD_ULD
-	select ORE
 	help
 	  EXOFS is a file system that uses an OSD storage device,
 	  as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442ec744..51f4b4c40f0 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
 /* u64 has problems with printk this will cast it to unsigned long long */
 #define _LLU(x) (unsigned long long)(x)
 
+struct exofs_dev {
+	struct ore_dev ored;
+	unsigned did;
+};
 /*
  * our extension to the in-memory superblock
  */
@@ -66,13 +70,9 @@ struct exofs_sb_info {
 	u32		s_next_generation;	/* next gen # to use          */
 	atomic_t	s_curr_pending;		/* number of pending commands */
 
-	struct pnfs_osd_data_map data_map;	/* Default raid to use
-						 * FIXME: Needed ?
-						 */
 	struct ore_layout	layout;		/* Default files layout       */
 	struct ore_comp one_comp;		/* id & cred of partition id=0*/
-	struct ore_components comps;		/* comps for the partition    */
-	struct osd_dev	*_min_one_dev[1];	/* Place holder for one dev   */
+	struct ore_components oc;		/* comps for the partition    */
 };
 
 /*
@@ -86,7 +86,7 @@ struct exofs_i_info {
 	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
 	uint64_t       i_commit_size;      /* the object's written length     */
 	struct ore_comp one_comp;	   /* same component for all devices  */
-	struct ore_components comps;	   /* inode view of the device table  */
+	struct ore_components oc;	   /* inode view of the device table  */
 };
 
 static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations;
  * bigger and that the device table repeats twice.
  * See: exofs_read_lookup_dev_table()
  */
-static inline void exofs_init_comps(struct ore_components *comps,
+static inline void exofs_init_comps(struct ore_components *oc,
 				    struct ore_comp *one_comp,
 				    struct exofs_sb_info *sbi, osd_id oid)
 {
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps,
 	one_comp->obj.id = oid;
 	exofs_make_credential(one_comp->cred, &one_comp->obj);
 
-	comps->numdevs = sbi->comps.numdevs;
-	comps->single_comp = EC_SINGLE_COMP;
-	comps->comps = one_comp;
+	oc->first_dev = 0;
+	oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+							sbi->layout.group_count;
+	oc->single_comp = EC_SINGLE_COMP;
+	oc->comps = one_comp;
 
 	/* Round robin device view of the table */
-	first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
-	comps->ods = sbi->comps.ods + first_dev;
+	first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+	oc->ods = &sbi->oc.ods[first_dev];
 }
 
 #endif
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38fc234..3e5f3a6be90 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@
 
 #define EXOFS_DBGMSG2(M...) do {} while (0)
 
-enum { BIO_MAX_PAGES_KMALLOC =
-		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-	MAX_PAGES_KMALLOC =
-		PAGE_SIZE / sizeof(struct page *),
-};
+enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
 
 unsigned exofs_max_io_pages(struct ore_layout *layout,
 			    unsigned expected_pages)
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout,
 	unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
 
 	/* TODO: easily support bio chaining */
-	pages =  min_t(unsigned, pages,
-		       layout->group_width * BIO_MAX_PAGES_KMALLOC);
+	pages =  min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
 	return pages;
 }
 
@@ -68,6 +63,7 @@ struct page_collect {
 	bool read_4_write; /* This means two things: that the read is sync
 			    * And the pages should not be unlocked.
 			    */
+	struct page *that_locked_page;
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	pcol->length = 0;
 	pcol->pg_first = -1;
 	pcol->read_4_write = false;
+	pcol->that_locked_page = NULL;
 }
 
 static void _pcol_reset(struct page_collect *pcol)
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol)
 	pcol->length = 0;
 	pcol->pg_first = -1;
 	pcol->ios = NULL;
+	pcol->that_locked_page = NULL;
 
 	/* this is probably the end of the loop but in writes
 	 * it might not end here. don't be left with nothing
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
 	return 0;
 }
 
+enum {PAGE_WAS_NOT_IN_IO = 17};
 static int update_read_page(struct page *page, int ret)
 {
-	if (ret == 0) {
+	switch (ret) {
+	case 0:
 		/* Everything is OK */
 		SetPageUptodate(page);
 		if (PageError(page))
 			ClearPageError(page);
-	} else if (ret == -EFAULT) {
+		break;
+	case -EFAULT:
 		/* In this case we were trying to read something that wasn't on
 		 * disk yet - return a page full of zeroes.  This should be OK,
 		 * because the object should be empty (if there was a write
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret)
 		SetPageUptodate(page);
 		if (PageError(page))
 			ClearPageError(page);
-		ret = 0; /* recovered error */
 		EXOFS_DBGMSG("recovered read error\n");
-	} else /* Error */
+		/* fall through */
+	case PAGE_WAS_NOT_IN_IO:
+		ret = 0; /* recovered error */
+		break;
+	default:
 		SetPageError(page);
-
+	}
 	return ret;
 }
 
 static void update_write_page(struct page *page, int ret)
 {
+	if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+		return; /* don't pass start don't collect $200 */
+
 	if (ret) {
 		mapping_set_error(page->mapping, ret);
 		SetPageError(page);
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret)
 static int __readpages_done(struct page_collect *pcol)
 {
 	int i;
-	u64 resid;
 	u64 good_bytes;
 	u64 length = 0;
-	int ret = ore_check_io(pcol->ios, &resid);
+	int ret = ore_check_io(pcol->ios, NULL);
 
-	if (likely(!ret))
+	if (likely(!ret)) {
 		good_bytes = pcol->length;
-	else
-		good_bytes = pcol->length - resid;
+		ret = PAGE_WAS_NOT_IN_IO;
+	} else {
+		good_bytes = 0;
+	}
 
 	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 	}
 }
 
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+	struct page_collect *pcol_src, struct page_collect *pcol)
+{
+	/* length was wrong or offset was not page aligned */
+	BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+	if (pcol_src->nr_pages > ios->nr_pages) {
+		struct page **src_page;
+		unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+		unsigned long len_less = pcol_src->length - ios->length;
+		unsigned i;
+		int ret;
+
+		/* This IO was trimmed */
+		pcol_src->nr_pages = ios->nr_pages;
+		pcol_src->length = ios->length;
+
+		/* Left over pages are passed to the next io */
+		pcol->expected_pages += pages_less;
+		pcol->nr_pages = pages_less;
+		pcol->length = len_less;
+		src_page = pcol_src->pages + pcol_src->nr_pages;
+		pcol->pg_first = (*src_page)->index;
+
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			return ret;
+
+		for (i = 0; i < pages_less; ++i)
+			pcol->pages[i] = *src_page++;
+
+		EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+			"pages_less=0x%x expected_pages=0x%x "
+			"next_offset=0x%llx next_len=0x%lx\n",
+			pcol_src->nr_pages, pages_less, pcol->expected_pages,
+			pcol->pg_first * PAGE_SIZE, pcol->length);
+	}
+	return 0;
+}
+
 static int read_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol)
 		return 0;
 
 	if (!pcol->ios) {
-		int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
+		int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
 					     pcol->pg_first << PAGE_CACHE_SHIFT,
 					     pcol->length, &pcol->ios);
 
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol)
 
 	ios = pcol->ios;
 	ios->pages = pcol->pages;
-	ios->nr_pages = pcol->nr_pages;
 
 	if (pcol->read_4_write) {
 		ore_read(pcol->ios);
@@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol)
 	*pcol_copy = *pcol;
 	ios->done = readpages_done;
 	ios->private = pcol_copy;
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_read(ios);
 	if (unlikely(ret))
 		goto err;
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-		  oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page)
 		EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
 			  page->index);
 
+	pcol->that_locked_page = page;
+
 	if (page->index < end_index)
 		len = PAGE_CACHE_SIZE;
 	else if (page->index == end_index)
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
+	ret = read_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
 	return read_exec(&pcol);
 }
 
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
 	int i;
-	u64 resid;
 	u64  good_bytes;
 	u64  length = 0;
-	int ret = ore_check_io(ios, &resid);
+	int ret = ore_check_io(ios, NULL);
 
 	atomic_dec(&pcol->sbi->s_curr_pending);
 
-	if (likely(!ret))
+	if (likely(!ret)) {
 		good_bytes = pcol->length;
-	else
-		good_bytes = pcol->length - resid;
+		ret = PAGE_WAS_NOT_IN_IO;
+	} else {
+		good_bytes = 0;
+	}
 
 	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p)
 	EXOFS_DBGMSG2("writepages_done END\n");
 }
 
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+	struct page_collect *pcol = priv;
+	pgoff_t index = offset / PAGE_SIZE;
+
+	if (!pcol->that_locked_page ||
+	    (pcol->that_locked_page->index != index)) {
+		struct page *page = find_get_page(pcol->inode->i_mapping, index);
+
+		if (!page) {
+			page = find_or_create_page(pcol->inode->i_mapping,
+						   index, GFP_NOFS);
+			if (unlikely(!page)) {
+				EXOFS_DBGMSG("grab_cache_page Failed "
+					"index=0x%llx\n", _LLU(index));
+				return NULL;
+			}
+			unlock_page(page);
+		}
+		if (PageDirty(page) || PageWriteback(page))
+			*uptodate = true;
+		else
+			*uptodate = PageUptodate(page);
+		EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+		return page;
+	} else {
+		EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+			     pcol->that_locked_page->index);
+		*uptodate = true;
+		return pcol->that_locked_page;
+	}
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+	struct page_collect *pcol = priv;
+
+	if (pcol->that_locked_page != page) {
+		EXOFS_DBGMSG("index=0x%lx\n", page->index);
+		page_cache_release(page);
+		return;
+	}
+	EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+	.get_page = &__r4w_get_page,
+	.put_page = &__r4w_put_page,
+};
+
 static int write_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol)
 		return 0;
 
 	BUG_ON(pcol->ios);
-	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
+	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
 				 pcol->pg_first << PAGE_CACHE_SHIFT,
 				 pcol->length, &pcol->ios);
-
 	if (unlikely(ret))
 		goto err;
 
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol)
 
 	ios = pcol->ios;
 	ios->pages = pcol_copy->pages;
-	ios->nr_pages = pcol_copy->nr_pages;
 	ios->done = writepages_done;
+	ios->r4w = &_r4w_op;
 	ios->private = pcol_copy;
 
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_write(ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol)
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
-		  pcol->length);
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping,
 	_pcol_init(&pcol, expected_pages, mapping->host);
 
 	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
-	if (ret) {
+	if (unlikely(ret)) {
 		EXOFS_ERR("write_cache_pages => %d\n", ret);
 		return ret;
 	}
 
-	return write_exec(&pcol);
+	ret = write_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		return write_exec(&pcol); /* pump the last reminder */
+	} else if (pcol.nr_pages) {
+		/* not SYNC let the reminder join the next writeout */
+		unsigned i;
+
+		for (i = 0; i < pcol.nr_pages; i++) {
+			struct page *page = pcol.pages[i];
+
+			end_page_writeback(page);
+			set_page_dirty(page);
+			unlock_page(page);
+		}
+	}
+	return 0;
 }
 
+/*
 static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct page_collect pcol;
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 
 	return write_exec(&pcol);
 }
-
+*/
 /* i_mutex held using inode->i_size directly */
 static void _write_failed(struct inode *inode, loff_t to)
 {
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset)
 const struct address_space_operations exofs_aops = {
 	.readpage	= exofs_readpage,
 	.readpages	= exofs_readpages,
-	.writepage	= exofs_writepage,
+	.writepage	= NULL,
 	.writepages	= exofs_writepages,
 	.write_begin	= exofs_write_begin_export,
 	.write_end	= exofs_write_end,
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
-	ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
+	ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
 	if (likely(!ret))
 		truncate_setsize(inode, newsize);
 
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	struct exofs_on_disk_inode_layout *layout;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		return ret;
 	}
 
-	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
-	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
+	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
 
 	ios->in_attr = attrs;
 	ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 	oi = exofs_i(inode);
 	__oi_init(oi);
-	exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
 			 exofs_oi_objno(oi));
 
 	/* read the inode from the osd */
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	spin_unlock(&sbi->s_next_gen_lock);
 	insert_inode_hash(inode);
 
-	exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
 			 exofs_oi_objno(oi));
 	exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
 
 	mark_inode_dirty(inode);
 
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
 		return ERR_PTR(ret);
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		goto free_args;
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode)
 	/* ignore the error, attempt a remove anyway */
 
 	/* Now Remove the OSD objects */
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
 		return;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af8819..fcfa86ae6fa 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -24,76 +24,287 @@
 
 #include <linux/slab.h>
 #include <asm/div64.h>
+#include <linux/lcm.h>
 
-#include <scsi/osd_ore.h>
+#include "ore_raid.h"
 
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
+MODULE_LICENSE("GPL");
+
+/* ore_verify_layout does a couple of things:
+ * 1. Given a minimum number of needed parameters fixes up the rest of the
+ *    members to be operatonals for the ore. The needed parameters are those
+ *    that are defined by the pnfs-objects layout STD.
+ * 2. Check to see if the current ore code actually supports these parameters
+ *    for example stripe_unit must be a multple of the system PAGE_SIZE,
+ *    and etc...
+ * 3. Cache some havily used calculations that will be needed by users.
+ */
+
+enum { BIO_MAX_PAGES_KMALLOC =
+		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
 
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
-	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
-	do { if (0) printk(fmt, ##a); } while (0)
-#endif
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+	u64 stripe_length;
+
+	switch (layout->raid_algorithm) {
+	case PNFS_OSD_RAID_0:
+		layout->parity = 0;
+		break;
+	case PNFS_OSD_RAID_5:
+		layout->parity = 1;
+		break;
+	case PNFS_OSD_RAID_PQ:
+	case PNFS_OSD_RAID_4:
+	default:
+		ORE_ERR("Only RAID_0/5 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+		ORE_ERR("Stripe Unit(0x%llx)"
+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
+			  _LLU(layout->stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+	if (layout->group_width) {
+		if (!layout->group_depth) {
+			ORE_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+			ORE_ERR("Data Map wrong, "
+				"numdevs=%d < group_width=%d * mirrors=%d\n",
+				total_comps, layout->group_width,
+				layout->mirrors_p1);
+			return -EINVAL;
+		}
+		layout->group_count = total_comps / layout->mirrors_p1 /
+						layout->group_width;
+	} else {
+		if (layout->group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %lld\n",
+				_LLU(layout->group_depth));
+		}
+		layout->group_width = total_comps / layout->mirrors_p1;
+		layout->group_depth = -1;
+		layout->group_count = 1;
+	}
 
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
+	stripe_length = (u64)layout->group_width * layout->stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+			_LLU(stripe_length));
+		return -EINVAL;
+	}
 
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
+	layout->max_io_length =
+		(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+							layout->group_width;
+	if (layout->parity) {
+		unsigned stripe_length =
+				(layout->group_width - layout->parity) *
+				layout->stripe_unit;
 
-MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
+		layout->max_io_length /= stripe_length;
+		layout->max_io_length *= stripe_length;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
 
 static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
 {
-	return ios->comps->comps[index & ios->comps->single_comp].cred;
+	return ios->oc->comps[index & ios->oc->single_comp].cred;
 }
 
 static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
 {
-	return &ios->comps->comps[index & ios->comps->single_comp].obj;
+	return &ios->oc->comps[index & ios->oc->single_comp].obj;
 }
 
 static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
 {
-	return ios->comps->ods[index];
+	ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+		    ios->oc->first_dev, ios->oc->numdevs, index,
+		    ios->oc->ods);
+
+	return ore_comp_dev(ios->oc, index);
 }
 
-int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
+int  _ore_get_io_state(struct ore_layout *layout,
+			struct ore_components *oc, unsigned numdevs,
+			unsigned sgs_per_dev, unsigned num_par_pages,
+			struct ore_io_state **pios)
+{
+	struct ore_io_state *ios;
+	struct page **pages;
+	struct osd_sg_entry *sgilist;
+	struct __alloc_all_io_state {
+		struct ore_io_state ios;
+		struct ore_per_dev_state per_dev[numdevs];
+		union {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		};
+	} *_aios;
+
+	if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+		_aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+		if (unlikely(!_aios)) {
+			ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+				   sizeof(*_aios));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		pages = num_par_pages ? _aios->pages : NULL;
+		sgilist = sgs_per_dev ? _aios->sglist : NULL;
+		ios = &_aios->ios;
+	} else {
+		struct __alloc_small_io_state {
+			struct ore_io_state ios;
+			struct ore_per_dev_state per_dev[numdevs];
+		} *_aio_small;
+		union __extra_part {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		} *extra_part;
+
+		_aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+		if (unlikely(!_aio_small)) {
+			ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+				   sizeof(*_aio_small));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+		if (unlikely(!extra_part)) {
+			ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+				   sizeof(*extra_part));
+			kfree(_aio_small);
+			*pios = NULL;
+			return -ENOMEM;
+		}
+
+		pages = num_par_pages ? extra_part->pages : NULL;
+		sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+		/* In this case the per_dev[0].sgilist holds the pointer to
+		 * be freed
+		 */
+		ios = &_aio_small->ios;
+		ios->extra_part_alloc = true;
+	}
+
+	if (pages) {
+		ios->parity_pages = pages;
+		ios->max_par_pages = num_par_pages;
+	}
+	if (sgilist) {
+		unsigned d;
+
+		for (d = 0; d < numdevs; ++d) {
+			ios->per_dev[d].sglist = sgilist;
+			sgilist += sgs_per_dev;
+		}
+		ios->sgs_per_dev = sgs_per_dev;
+	}
+
+	ios->layout = layout;
+	ios->oc = oc;
+	*pios = ios;
+	return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used becase it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less then @length bytes do to alignmets
+ * and constrains as follows:
+ * - The IO cannot cross group boundary.
+ * - In raid5/6 The end of the IO must align at end of a stripe eg.
+ *   (@offset + @length) % strip_size == 0. Or the complete range is within a
+ *   single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ *   And check the returned ios->length for max_io_size.)
+ *
+ * The caller must check returned ios->length (and/or ios->nr_pages) and
+ * re-issue these pages that fall outside of ios->length
+ */
+int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
 		      bool is_reading, u64 offset, u64 length,
 		      struct ore_io_state **pios)
 {
 	struct ore_io_state *ios;
+	unsigned numdevs = layout->group_width * layout->mirrors_p1;
+	unsigned sgs_per_dev = 0, max_par_pages = 0;
+	int ret;
 
-	/*TODO: Maybe use kmem_cach per sbi of size
-	 * exofs_io_state_size(layout->s_numdevs)
-	 */
-	ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
-	if (unlikely(!ios)) {
-		ORE_DBGMSG("Failed kzalloc bytes=%d\n",
-			     ore_io_state_size(comps->numdevs));
-		*pios = NULL;
-		return -ENOMEM;
+	if (layout->parity && length) {
+		unsigned data_devs = layout->group_width - layout->parity;
+		unsigned stripe_size = layout->stripe_unit * data_devs;
+		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+		u32 remainder;
+		u64 num_stripes;
+		u64 num_raid_units;
+
+		num_stripes = div_u64_rem(length, stripe_size, &remainder);
+		if (remainder)
+			++num_stripes;
+
+		num_raid_units =  num_stripes * layout->parity;
+
+		if (is_reading) {
+			/* For reads add per_dev sglist array */
+			/* TODO: Raid 6 we need twice more. Actually:
+			*         num_stripes / LCMdP(W,P);
+			*         if (W%P != 0) num_stripes *= parity;
+			*/
+
+			/* first/last seg is split */
+			num_raid_units += layout->group_width;
+			sgs_per_dev = div_u64(num_raid_units, data_devs);
+		} else {
+			/* For Writes add parity pages array. */
+			max_par_pages = num_raid_units * pages_in_unit *
+						sizeof(struct page *);
+		}
 	}
 
-	ios->layout = layout;
-	ios->comps = comps;
-	ios->offset = offset;
-	ios->length = length;
+	ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+				pios);
+	if (unlikely(ret))
+		return ret;
+
+	ios = *pios;
 	ios->reading = is_reading;
+	ios->offset = offset;
+
+	if (length) {
+		ore_calc_stripe_info(layout, offset, length, &ios->si);
+		ios->length = ios->si.length;
+		ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+		if (layout->parity)
+			_ore_post_alloc_raid_stuff(ios);
+	}
 
-	*pios = ios;
 	return 0;
 }
 EXPORT_SYMBOL(ore_get_rw_state);
 
-int  ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
-		      struct ore_io_state **ios)
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wastful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
+int  ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
+		      struct ore_io_state **pios)
 {
-	return ore_get_rw_state(layout, comps, true, 0, 0, ios);
+	return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
 }
 EXPORT_SYMBOL(ore_get_io_state);
 
@@ -111,6 +322,7 @@ void ore_put_io_state(struct ore_io_state *ios)
 				bio_put(per_dev->bio);
 		}
 
+		_ore_free_raid_stuff(ios);
 		kfree(ios);
 	}
 }
@@ -138,7 +350,7 @@ static void _done_io(struct osd_request *or, void *p)
 	kref_put(&ios->kref, _last_io);
 }
 
-static int ore_io_execute(struct ore_io_state *ios)
+int ore_io_execute(struct ore_io_state *ios)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	bool sync = (ios->done == NULL);
@@ -198,7 +410,7 @@ static void _clear_bio(struct bio *bio)
 	}
 }
 
-int ore_check_io(struct ore_io_state *ios, u64 *resid)
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
 {
 	enum osd_err_priority acumulated_osd_err = 0;
 	int acumulated_lin_err = 0;
@@ -206,7 +418,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
 
 	for (i = 0; i < ios->numdevs; i++) {
 		struct osd_sense_info osi;
-		struct osd_request *or = ios->per_dev[i].or;
+		struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+		struct osd_request *or = per_dev->or;
 		int ret;
 
 		if (unlikely(!or))
@@ -218,29 +431,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid)
 
 		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
 			/* start read offset passed endof file */
-			_clear_bio(ios->per_dev[i].bio);
+			_clear_bio(per_dev->bio);
 			ORE_DBGMSG("start read offset passed end of file "
 				"offset=0x%llx, length=0x%llx\n",
-				_LLU(ios->per_dev[i].offset),
-				_LLU(ios->per_dev[i].length));
+				_LLU(per_dev->offset),
+				_LLU(per_dev->length));
 
 			continue; /* we recovered */
 		}
 
+		if (on_dev_error) {
+			u64 residual = ios->reading ?
+					or->in.residual : or->out.residual;
+			u64 offset = (ios->offset + ios->length) - residual;
+			struct ore_dev *od = ios->oc->ods[
+					per_dev->dev - ios->oc->first_dev];
+
+			on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
+				     offset, residual);
+		}
 		if (osi.osd_err_pri >= acumulated_osd_err) {
 			acumulated_osd_err = osi.osd_err_pri;
 			acumulated_lin_err = ret;
 		}
 	}
 
-	/* TODO: raid specific residual calculations */
-	if (resid) {
-		if (likely(!acumulated_lin_err))
-			*resid = 0;
-		else
-			*resid = ios->length;
-	}
-
 	return acumulated_lin_err;
 }
 EXPORT_SYMBOL(ore_check_io);
@@ -248,61 +463,65 @@ EXPORT_SYMBOL(ore_check_io);
 /*
  * L - logical offset into the file
  *
- * U - The number of bytes in a stripe within a group
+ * D - number of Data devices
+ *	D = group_width - parity
  *
- *	U = stripe_unit * group_width
+ * U - The number of bytes in a stripe within a group
+ *	U =  stripe_unit * D
  *
  * T - The number of bytes striped within a group of component objects
  *     (before advancing to the next group)
- *
- *	T = stripe_unit * group_width * group_depth
+ *	T = U * group_depth
  *
  * S - The number of bytes striped across all component objects
  *     before the pattern repeats
+ *	S = T * group_count
  *
- *	S = stripe_unit * group_width * group_depth * group_count
- *
- * M - The "major" (i.e., across all components) stripe number
- *
+ * M - The "major" (i.e., across all components) cycle number
  *	M = L / S
  *
- * G - Counts the groups from the beginning of the major stripe
- *
+ * G - Counts the groups from the beginning of the major cycle
  *	G = (L - (M * S)) / T	[or (L % S) / T]
  *
  * H - The byte offset within the group
- *
  *	H = (L - (M * S)) % T	[or (L % S) % T]
  *
  * N - The "minor" (i.e., across the group) stripe number
- *
  *	N = H / U
  *
  * C - The component index coresponding to L
  *
- *	C = (H - (N * U)) / stripe_unit + G * group_width
- *	[or (L % U) / stripe_unit + G * group_width]
+ *	C = (H - (N * U)) / stripe_unit + G * D
+ *	[or (L % U) / stripe_unit + G * D]
  *
  * O - The component offset coresponding to L
- *
  *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ *          divide by parity
+ *	LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ *     (Note parity cycle always starts at a group's boundary)
+ *	R = N % LCMdP
+ *
+ * I = the first parity device index
+ *	I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ *	Craid = (group_width + C - R*parity) % group_width
+ *      (We add the group_width to avoid negative numbers modulo math)
  */
-struct _striping_info {
-	u64 obj_offset;
-	u64 group_length;
-	u64 M; /* for truncate */
-	unsigned dev;
-	unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
-			      struct _striping_info *si)
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+			  u64 length, struct ore_striping_info *si)
 {
 	u32	stripe_unit = layout->stripe_unit;
 	u32	group_width = layout->group_width;
 	u64	group_depth = layout->group_depth;
+	u32	parity      = layout->parity;
 
-	u32	U = stripe_unit * group_width;
+	u32	D = group_width - parity;
+	u32	U = D * stripe_unit;
 	u64	T = U * group_depth;
 	u64	S = T * layout->group_count;
 	u64	M = div64_u64(file_offset, S);
@@ -318,39 +537,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
 	u32	N = div_u64(H, U);
 
 	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
-	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
-	si->dev *= layout->mirrors_p1;
+	u32	C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
 
 	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
 
 	si->obj_offset = si->unit_off + (N * stripe_unit) +
 				  (M * group_depth * stripe_unit);
 
-	si->group_length = T - H;
+	if (parity) {
+		u32 LCMdP = lcm(group_width, parity) / parity;
+		/* R     = N % LCMdP; */
+		u32 RxP   = (N % LCMdP) * parity;
+		u32 first_dev = C - C % group_width;
+
+		si->par_dev = (group_width + group_width - parity - RxP) %
+			      group_width + first_dev;
+		si->dev = (group_width + C - RxP) % group_width + first_dev;
+		si->bytes_in_stripe = U;
+		si->first_stripe_start = M * S + G * T + N * U;
+	} else {
+		/* Make the math correct see _prepare_one_group */
+		si->par_dev = group_width;
+		si->dev = C;
+	}
+
+	si->dev *= layout->mirrors_p1;
+	si->par_dev *= layout->mirrors_p1;
+	si->offset = file_offset;
+	si->length = T - H;
+	if (si->length > length)
+		si->length = length;
 	si->M = M;
 }
+EXPORT_SYMBOL(ore_calc_stripe_info);
 
-static int _add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
-		unsigned pgbase, struct ore_per_dev_state *per_dev,
-		int cur_len)
+int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
+			 unsigned pgbase, struct page **pages,
+			 struct ore_per_dev_state *per_dev, int cur_len)
 {
 	unsigned pg = *cur_pg;
 	struct request_queue *q =
 			osd_request_queue(_ios_od(ios, per_dev->dev));
-
-	per_dev->length += cur_len;
+	unsigned len = cur_len;
+	int ret;
 
 	if (per_dev->bio == NULL) {
 		unsigned pages_in_stripe = ios->layout->group_width *
 					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
-						ios->layout->group_width;
+		unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
+					(ios->layout->group_width -
+					 ios->layout->parity);
+		unsigned bio_size = (nr_pages + pages_in_stripe) /
+					ios->layout->group_width;
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
 			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
 				     bio_size);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out;
 		}
 	}
 
@@ -358,64 +603,90 @@ static int _add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
 		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
 		unsigned added_len;
 
-		BUG_ON(ios->nr_pages <= pg);
 		cur_len -= pglen;
 
-		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
 					    pglen, pgbase);
-		if (unlikely(pglen != added_len))
-			return -ENOMEM;
+		if (unlikely(pglen != added_len)) {
+			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
+				   per_dev->bio->bi_vcnt);
+			ret = -ENOMEM;
+			goto out;
+		}
+		_add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
+
 		pgbase = 0;
 		++pg;
 	}
 	BUG_ON(cur_len);
 
+	per_dev->length += len;
 	*cur_pg = pg;
-	return 0;
+	ret = 0;
+out:	/* we fail the complete unit on an error eg don't advance
+	 * per_dev->length and cur_pg. This means that we might have a bigger
+	 * bio than the CDB requested length (per_dev->length). That's fine
+	 * only the oposite is fatal.
+	 */
+	return ret;
 }
 
-static int _prepare_one_group(struct ore_io_state *ios, u64 length,
-			      struct _striping_info *si)
+static int _prepare_for_striping(struct ore_io_state *ios)
 {
+	struct ore_striping_info *si = &ios->si;
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned mirrors_p1 = ios->layout->mirrors_p1;
-	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+	unsigned group_width = ios->layout->group_width;
+	unsigned devs_in_group = group_width * mirrors_p1;
 	unsigned dev = si->dev;
 	unsigned first_dev = dev - (dev % devs_in_group);
-	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+	unsigned dev_order;
 	unsigned cur_pg = ios->pages_consumed;
+	u64 length = ios->length;
 	int ret = 0;
 
+	if (!ios->pages) {
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	BUG_ON(length > si->length);
+
+	dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
+	si->cur_comp = dev_order;
+	si->cur_pg = si->unit_off / PAGE_SIZE;
+
 	while (length) {
-		struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
+		unsigned comp = dev - first_dev;
+		struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
 		unsigned cur_len, page_off = 0;
 
 		if (!per_dev->length) {
 			per_dev->dev = dev;
-			if (dev < si->dev) {
-				per_dev->offset = si->obj_offset + stripe_unit -
-								   si->unit_off;
-				cur_len = stripe_unit;
-			} else if (dev == si->dev) {
+			if (dev == si->dev) {
+				WARN_ON(dev == si->par_dev);
 				per_dev->offset = si->obj_offset;
 				cur_len = stripe_unit - si->unit_off;
 				page_off = si->unit_off & ~PAGE_MASK;
 				BUG_ON(page_off && (page_off != ios->pgbase));
-			} else { /* dev > si->dev */
-				per_dev->offset = si->obj_offset - si->unit_off;
+			} else {
+				if (si->cur_comp > dev_order)
+					per_dev->offset =
+						si->obj_offset - si->unit_off;
+				else /* si->cur_comp < dev_order */
+					per_dev->offset =
+						si->obj_offset + stripe_unit -
+								   si->unit_off;
 				cur_len = stripe_unit;
 			}
-
-			if (max_comp < dev)
-				max_comp = dev;
 		} else {
 			cur_len = stripe_unit;
 		}
 		if (cur_len >= length)
 			cur_len = length;
 
-		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
-				       cur_len);
+		ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
+					   per_dev, cur_len);
 		if (unlikely(ret))
 			goto out;
 
@@ -423,60 +694,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length,
 		dev = (dev % devs_in_group) + first_dev;
 
 		length -= cur_len;
-	}
-out:
-	ios->numdevs = max_comp + mirrors_p1;
-	ios->pages_consumed = cur_pg;
-	return ret;
-}
-
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
-	u64 length = ios->length;
-	u64 offset = ios->offset;
-	struct _striping_info si;
-	int ret = 0;
 
-	if (!ios->pages) {
-		if (ios->kern_buff) {
-			struct ore_per_dev_state *per_dev = &ios->per_dev[0];
+		si->cur_comp = (si->cur_comp + 1) % group_width;
+		if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
+			if (!length && ios->sp2d) {
+				/* If we are writing and this is the very last
+				 * stripe. then operate on parity dev.
+				 */
+				dev = si->par_dev;
+			}
+			if (ios->sp2d)
+				/* In writes cur_len just means if it's the
+				 * last one. See _ore_add_parity_unit.
+				 */
+				cur_len = length;
+			per_dev = &ios->per_dev[dev - first_dev];
+			if (!per_dev->length) {
+				/* Only/always the parity unit of the first
+				 * stripe will be empty. So this is a chance to
+				 * initialize the per_dev info.
+				 */
+				per_dev->dev = dev;
+				per_dev->offset = si->obj_offset - si->unit_off;
+			}
 
-			_calc_stripe_info(ios->layout, ios->offset, &si);
-			per_dev->offset = si.obj_offset;
-			per_dev->dev = si.dev;
+			ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+			if (unlikely(ret))
+					goto out;
 
-			/* no cross device without page array */
-			BUG_ON((ios->layout->group_width > 1) &&
-			       (si.unit_off + ios->length >
-				ios->layout->stripe_unit));
+			/* Rotate next par_dev backwards with wraping */
+			si->par_dev = (devs_in_group + si->par_dev -
+				       ios->layout->parity * mirrors_p1) %
+				      devs_in_group + first_dev;
+			/* Next stripe, start fresh */
+			si->cur_comp = 0;
+			si->cur_pg = 0;
 		}
-		ios->numdevs = ios->layout->mirrors_p1;
-		return 0;
-	}
-
-	while (length) {
-		_calc_stripe_info(ios->layout, offset, &si);
-
-		if (length < si.group_length)
-			si.group_length = length;
-
-		ret = _prepare_one_group(ios, si.group_length, &si);
-		if (unlikely(ret))
-			goto out;
-
-		offset += si.group_length;
-		length -= si.group_length;
 	}
-
 out:
-	return ret;
+	ios->numdevs = devs_in_group;
+	ios->pages_consumed = cur_pg;
+	if (unlikely(ret)) {
+		if (length == ios->length)
+			return ret;
+		else
+			ios->length -= length;
+	}
+	return 0;
 }
 
 int ore_create(struct ore_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->comps->numdevs; i++) {
+	for (i = 0; i < ios->oc->numdevs; i++) {
 		struct osd_request *or;
 
 		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -501,7 +772,7 @@ int ore_remove(struct ore_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->comps->numdevs; i++) {
+	for (i = 0; i < ios->oc->numdevs; i++) {
 		struct osd_request *or;
 
 		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -543,7 +814,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 			goto out;
 		}
 		per_dev->or = or;
-		per_dev->offset = master_dev->offset;
 
 		if (ios->pages) {
 			struct bio *bio;
@@ -562,6 +832,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 				__bio_clone(bio, master_dev->bio);
 				bio->bi_bdev = NULL;
 				bio->bi_next = NULL;
+				per_dev->offset = master_dev->offset;
 				per_dev->length = master_dev->length;
 				per_dev->bio =  bio;
 				per_dev->dev = dev;
@@ -579,7 +850,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 				     _LLU(per_dev->offset),
 				     _LLU(per_dev->length), dev);
 		} else if (ios->kern_buff) {
-			ret = osd_req_write_kern(or, _ios_obj(ios, dev),
+			per_dev->offset = ios->si.obj_offset;
+			per_dev->dev = ios->si.dev + dev;
+
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (ios->si.unit_off + ios->length >
+				ios->layout->stripe_unit));
+
+			ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
 						 per_dev->offset,
 						 ios->kern_buff, ios->length);
 			if (unlikely(ret))
@@ -588,7 +867,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 				      "length=0x%llx dev=%d\n",
 				     _LLU(_ios_obj(ios, dev)->id),
 				     _LLU(per_dev->offset),
-				     _LLU(ios->length), dev);
+				     _LLU(ios->length), per_dev->dev);
 		} else {
 			osd_req_set_attributes(or, _ios_obj(ios, dev));
 			ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
@@ -614,6 +893,14 @@ int ore_write(struct ore_io_state *ios)
 	int i;
 	int ret;
 
+	if (unlikely(ios->sp2d && !ios->r4w)) {
+		/* A library is attempting a RAID-write without providing
+		 * a pages lock interface.
+		 */
+		WARN_ON_ONCE(1);
+		return -ENOTSUPP;
+	}
+
 	ret = _prepare_for_striping(ios);
 	if (unlikely(ret))
 		return ret;
@@ -629,7 +916,7 @@ int ore_write(struct ore_io_state *ios)
 }
 EXPORT_SYMBOL(ore_write);
 
-static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
 {
 	struct osd_request *or;
 	struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
@@ -648,22 +935,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
 	per_dev->or = or;
 
 	if (ios->pages) {
-		osd_req_read(or, obj, per_dev->offset,
-				per_dev->bio, per_dev->length);
+		if (per_dev->cur_sg) {
+			/* finalize the last sg_entry */
+			_ore_add_sg_seg(per_dev, 0, false);
+			if (unlikely(!per_dev->cur_sg))
+				return 0; /* Skip parity only device */
+
+			osd_req_read_sg(or, obj, per_dev->bio,
+					per_dev->sglist, per_dev->cur_sg);
+		} else {
+			/* The no raid case */
+			osd_req_read(or, obj, per_dev->offset,
+				     per_dev->bio, per_dev->length);
+		}
+
 		ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-			     " dev=%d\n", _LLU(obj->id),
+			     " dev=%d sg_len=%d\n", _LLU(obj->id),
 			     _LLU(per_dev->offset), _LLU(per_dev->length),
-			     first_dev);
-	} else if (ios->kern_buff) {
-		int ret = osd_req_read_kern(or, obj, per_dev->offset,
-					    ios->kern_buff, ios->length);
-		ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
-			      "length=0x%llx dev=%d ret=>%d\n",
-			      _LLU(obj->id), _LLU(per_dev->offset),
-			      _LLU(ios->length), first_dev, ret);
-		if (unlikely(ret))
-			return ret;
+			     first_dev, per_dev->cur_sg);
 	} else {
+		BUG_ON(ios->kern_buff);
+
 		osd_req_get_attributes(or, obj);
 		ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
 			      _LLU(obj->id),
@@ -688,7 +980,7 @@ int ore_read(struct ore_io_state *ios)
 		return ret;
 
 	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
-		ret = _read_mirror(ios, i);
+		ret = _ore_read_mirror(ios, i);
 		if (unlikely(ret))
 			return ret;
 	}
@@ -744,31 +1036,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
 }
 
 struct _trunc_info {
-	struct _striping_info si;
+	struct ore_striping_info si;
 	u64 prev_group_obj_off;
 	u64 next_group_obj_off;
 
 	unsigned first_group_dev;
 	unsigned nex_group_dev;
-	unsigned max_devs;
 };
 
-void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
-		       struct _trunc_info *ti)
+static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
+			     struct _trunc_info *ti)
 {
 	unsigned stripe_unit = layout->stripe_unit;
 
-	_calc_stripe_info(layout, file_offset, &ti->si);
+	ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
 
 	ti->prev_group_obj_off = ti->si.M * stripe_unit;
 	ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
 
 	ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
 	ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-	ti->max_devs = layout->group_width * layout->group_count;
 }
 
-int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
+int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
 		   u64 size)
 {
 	struct ore_io_state *ios;
@@ -779,22 +1069,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
 	struct _trunc_info ti;
 	int i, ret;
 
-	ret = ore_get_io_state(layout, comps, &ios);
+	ret = ore_get_io_state(layout, oc, &ios);
 	if (unlikely(ret))
 		return ret;
 
 	_calc_trunk_info(ios->layout, size, &ti);
 
-	size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
+	size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
 			     GFP_KERNEL);
 	if (unlikely(!size_attrs)) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	ios->numdevs = ios->comps->numdevs;
+	ios->numdevs = ios->oc->numdevs;
 
-	for (i = 0; i < ti.max_devs; ++i) {
+	for (i = 0; i < ios->numdevs; ++i) {
 		struct exofs_trunc_attr *size_attr = &size_attrs[i];
 		u64 obj_size;
 
@@ -815,7 +1105,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
 		size_attr->attr.val_ptr = &size_attr->newsize;
 
 		ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
-			     _LLU(comps->comps->obj.id), _LLU(obj_size), i);
+			     _LLU(oc->comps->obj.id), _LLU(obj_size), i);
 		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
 					&size_attr->attr);
 		if (unlikely(ret))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 00000000000..29c47e5c4a8
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (C) 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <linux/gfp.h>
+#include <linux/async_tx.h>
+
+#include "ore_raid.h"
+
+#undef ORE_DBGMSG2
+#define ORE_DBGMSG2 ORE_DBGMSG
+
+struct page *_raid_page_alloc(void)
+{
+	return alloc_page(GFP_KERNEL);
+}
+
+void _raid_page_free(struct page *p)
+{
+	__free_page(p);
+}
+
+/* This struct is forward declare in ore_io_state, but is private to here.
+ * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
+ *
+ * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
+ * Ascending page index access is sp2d(p-minor, c-major). But storage is
+ * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
+ * API.
+ */
+struct __stripe_pages_2d {
+	/* Cache some hot path repeated calculations */
+	unsigned parity;
+	unsigned data_devs;
+	unsigned pages_in_unit;
+
+	bool needed ;
+
+	/* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
+	struct __1_page_stripe {
+		bool alloc;
+		unsigned write_count;
+		struct async_submit_ctl submit;
+		struct dma_async_tx_descriptor *tx;
+
+		/* The size of this array is data_devs + parity */
+		struct page **pages;
+		struct page **scribble;
+		/* bool array, size of this array is data_devs */
+		char *page_is_read;
+	} _1p_stripes[];
+};
+
+/* This can get bigger then a page. So support multiple page allocations
+ * _sp2d_free should be called even if _sp2d_alloc fails (by returning
+ * none-zero).
+ */
+static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
+		       unsigned parity, struct __stripe_pages_2d **psp2d)
+{
+	struct __stripe_pages_2d *sp2d;
+	unsigned data_devs = group_width - parity;
+	struct _alloc_all_bytes {
+		struct __alloc_stripe_pages_2d {
+			struct __stripe_pages_2d sp2d;
+			struct __1_page_stripe _1p_stripes[pages_in_unit];
+		} __asp2d;
+		struct __alloc_1p_arrays {
+			struct page *pages[group_width];
+			struct page *scribble[group_width];
+			char page_is_read[data_devs];
+		} __a1pa[pages_in_unit];
+	} *_aab;
+	struct __alloc_1p_arrays *__a1pa;
+	struct __alloc_1p_arrays *__a1pa_end;
+	const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+	unsigned num_a1pa, alloc_size, i;
+
+	/* FIXME: check these numbers in ore_verify_layout */
+	BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+	BUG_ON(sizeof__a1pa > PAGE_SIZE);
+
+	if (sizeof(*_aab) > PAGE_SIZE) {
+		num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
+		alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+	} else {
+		num_a1pa = pages_in_unit;
+		alloc_size = sizeof(*_aab);
+	}
+
+	_aab = kzalloc(alloc_size, GFP_KERNEL);
+	if (unlikely(!_aab)) {
+		ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
+		return -ENOMEM;
+	}
+
+	sp2d = &_aab->__asp2d.sp2d;
+	*psp2d = sp2d; /* From here Just call _sp2d_free */
+
+	__a1pa = _aab->__a1pa;
+	__a1pa_end = __a1pa + num_a1pa;
+
+	for (i = 0; i < pages_in_unit; ++i) {
+		if (unlikely(__a1pa >= __a1pa_end)) {
+			num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
+							pages_in_unit - i);
+
+			__a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
+			if (unlikely(!__a1pa)) {
+				ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
+					   num_a1pa);
+				return -ENOMEM;
+			}
+			__a1pa_end = __a1pa + num_a1pa;
+			/* First *pages is marked for kfree of the buffer */
+			sp2d->_1p_stripes[i].alloc = true;
+		}
+
+		sp2d->_1p_stripes[i].pages = __a1pa->pages;
+		sp2d->_1p_stripes[i].scribble = __a1pa->scribble ;
+		sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
+		++__a1pa;
+	}
+
+	sp2d->parity = parity;
+	sp2d->data_devs = data_devs;
+	sp2d->pages_in_unit = pages_in_unit;
+	return 0;
+}
+
+static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
+			const struct _ore_r4w_op *r4w, void *priv)
+{
+	unsigned data_devs = sp2d->data_devs;
+	unsigned group_width = data_devs + sp2d->parity;
+	unsigned p;
+
+	if (!sp2d->needed)
+		return;
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count < group_width) {
+			unsigned c;
+
+			for (c = 0; c < data_devs; c++)
+				if (_1ps->page_is_read[c]) {
+					struct page *page = _1ps->pages[c];
+
+					r4w->put_page(priv, page);
+					_1ps->page_is_read[c] = false;
+				}
+		}
+
+		memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
+		_1ps->write_count = 0;
+		_1ps->tx = NULL;
+	}
+
+	sp2d->needed = false;
+}
+
+static void _sp2d_free(struct __stripe_pages_2d *sp2d)
+{
+	unsigned i;
+
+	if (!sp2d)
+		return;
+
+	for (i = 0; i < sp2d->pages_in_unit; ++i) {
+		if (sp2d->_1p_stripes[i].alloc)
+			kfree(sp2d->_1p_stripes[i].pages);
+	}
+
+	kfree(sp2d);
+}
+
+static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
+{
+	unsigned p;
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count)
+			return p;
+	}
+
+	return ~0;
+}
+
+static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
+{
+	unsigned p;
+
+	for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count)
+			return p;
+	}
+
+	return ~0;
+}
+
+static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
+{
+	unsigned p;
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (!_1ps->write_count)
+			continue;
+
+		init_async_submit(&_1ps->submit,
+			ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
+			NULL,
+			NULL, NULL,
+			(addr_conv_t *)_1ps->scribble);
+
+		/* TODO: raid6 */
+		_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
+				     0, sp2d->data_devs, PAGE_SIZE,
+				     &_1ps->submit);
+	}
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+		/* NOTE: We wait for HW synchronously (I don't have such HW
+		 * to test with.) Is parallelism needed with today's multi
+		 * cores?
+		 */
+		async_tx_issue_pending(_1ps->tx);
+	}
+}
+
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+		       struct ore_striping_info *si, struct page *page)
+{
+	struct __1_page_stripe *_1ps;
+
+	sp2d->needed = true;
+
+	_1ps = &sp2d->_1p_stripes[si->cur_pg];
+	_1ps->pages[si->cur_comp] = page;
+	++_1ps->write_count;
+
+	si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
+	/* si->cur_comp is advanced outside at main loop */
+}
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		     bool not_last)
+{
+	struct osd_sg_entry *sge;
+
+	ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
+		     "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
+		     per_dev->dev, cur_len, not_last, per_dev->cur_sg,
+		     _LLU(per_dev->offset), per_dev->length,
+		     per_dev->last_sgs_total);
+
+	if (!per_dev->cur_sg) {
+		sge = per_dev->sglist;
+
+		/* First time we prepare two entries */
+		if (per_dev->length) {
+			++per_dev->cur_sg;
+			sge->offset = per_dev->offset;
+			sge->len = per_dev->length;
+		} else {
+			/* Here the parity is the first unit of this object.
+			 * This happens every time we reach a parity device on
+			 * the same stripe as the per_dev->offset. We need to
+			 * just skip this unit.
+			 */
+			per_dev->offset += cur_len;
+			return;
+		}
+	} else {
+		/* finalize the last one */
+		sge = &per_dev->sglist[per_dev->cur_sg - 1];
+		sge->len = per_dev->length - per_dev->last_sgs_total;
+	}
+
+	if (not_last) {
+		/* Partly prepare the next one */
+		struct osd_sg_entry *next_sge = sge + 1;
+
+		++per_dev->cur_sg;
+		next_sge->offset = sge->offset + sge->len + cur_len;
+		/* Save cur len so we know how mutch was added next time */
+		per_dev->last_sgs_total = per_dev->length;
+		next_sge->len = 0;
+	} else if (!sge->len) {
+		/* Optimize for when the last unit is a parity */
+		--per_dev->cur_sg;
+	}
+}
+
+static int _alloc_read_4_write(struct ore_io_state *ios)
+{
+	struct ore_layout *layout = ios->layout;
+	int ret;
+	/* We want to only read those pages not in cache so worst case
+	 * is a stripe populated with every other page
+	 */
+	unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
+
+	ret = _ore_get_io_state(layout, ios->oc,
+				layout->group_width * layout->mirrors_p1,
+				sgs_per_dev, 0, &ios->ios_read_4_write);
+	return ret;
+}
+
+/* @si contains info of the to-be-inserted page. Update of @si should be
+ * maintained by caller. Specificaly si->dev, si->obj_offset, ...
+ */
+static int _add_to_read_4_write(struct ore_io_state *ios,
+				struct ore_striping_info *si, struct page *page)
+{
+	struct request_queue *q;
+	struct ore_per_dev_state *per_dev;
+	struct ore_io_state *read_ios;
+	unsigned first_dev = si->dev - (si->dev %
+			  (ios->layout->group_width * ios->layout->mirrors_p1));
+	unsigned comp = si->dev - first_dev;
+	unsigned added_len;
+
+	if (!ios->ios_read_4_write) {
+		int ret = _alloc_read_4_write(ios);
+
+		if (unlikely(ret))
+			return ret;
+	}
+
+	read_ios = ios->ios_read_4_write;
+	read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
+
+	per_dev = &read_ios->per_dev[comp];
+	if (!per_dev->length) {
+		per_dev->bio = bio_kmalloc(GFP_KERNEL,
+					   ios->sp2d->pages_in_unit);
+		if (unlikely(!per_dev->bio)) {
+			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+				     ios->sp2d->pages_in_unit);
+			return -ENOMEM;
+		}
+		per_dev->offset = si->obj_offset;
+		per_dev->dev = si->dev;
+	} else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
+		u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
+
+		_ore_add_sg_seg(per_dev, gap, true);
+	}
+	q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
+	added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
+	if (unlikely(added_len != PAGE_SIZE)) {
+		ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
+			      per_dev->bio->bi_vcnt);
+		return -ENOMEM;
+	}
+
+	per_dev->length += PAGE_SIZE;
+	return 0;
+}
+
+static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
+{
+	struct bio_vec *bv;
+	unsigned i, d;
+
+	/* loop on all devices all pages */
+	for (d = 0; d < ios->numdevs; d++) {
+		struct bio *bio = ios->per_dev[d].bio;
+
+		if (!bio)
+			continue;
+
+		__bio_for_each_segment(bv, bio, i, 0) {
+			struct page *page = bv->bv_page;
+
+			SetPageUptodate(page);
+			if (PageError(page))
+				ClearPageError(page);
+		}
+	}
+}
+
+/* read_4_write is hacked to read the start of the first stripe and/or
+ * the end of the last stripe. If needed, with an sg-gap at each device/page.
+ * It is assumed to be called after the to_be_written pages of the first stripe
+ * are populating ios->sp2d[][]
+ *
+ * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
+ * These pages are held at sp2d[p].pages[c] but with
+ * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
+ * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
+ * @uptodate=true, so we don't need to read it, only unlock, after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
+ * to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in cache.
+ * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], xor is calculated the same way. These pages are
+ * allocated/freed and don't go through cache
+ */
+static int _read_4_write(struct ore_io_state *ios)
+{
+	struct ore_io_state *ios_read;
+	struct ore_striping_info read_si;
+	struct __stripe_pages_2d *sp2d = ios->sp2d;
+	u64 offset = ios->si.first_stripe_start;
+	u64 last_stripe_end;
+	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+	unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+	int ret;
+
+	if (offset == ios->offset) /* Go to start collect $200 */
+		goto read_last_stripe;
+
+	min_p = _sp2d_min_pg(sp2d);
+	max_p = _sp2d_max_pg(sp2d);
+
+	for (c = 0; ; c++) {
+		ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+		read_si.obj_offset += min_p * PAGE_SIZE;
+		offset += min_p * PAGE_SIZE;
+		for (p = min_p; p <= max_p; p++) {
+			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+			struct page **pp = &_1ps->pages[c];
+			bool uptodate;
+
+			if (*pp)
+				/* to-be-written pages start here */
+				goto read_last_stripe;
+
+			*pp = ios->r4w->get_page(ios->private, offset,
+						 &uptodate);
+			if (unlikely(!*pp))
+				return -ENOMEM;
+
+			if (!uptodate)
+				_add_to_read_4_write(ios, &read_si, *pp);
+
+			/* Mark read-pages to be cache_released */
+			_1ps->page_is_read[c] = true;
+			read_si.obj_offset += PAGE_SIZE;
+			offset += PAGE_SIZE;
+		}
+		offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+	}
+
+read_last_stripe:
+	offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
+				PAGE_SIZE * PAGE_SIZE;
+	last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+				 * bytes_in_stripe;
+	if (offset == last_stripe_end) /* Optimize for the aligned case */
+		goto read_it;
+
+	ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+	p = read_si.unit_off / PAGE_SIZE;
+	c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
+		       ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+
+	BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
+	/* unaligned IO must be within a single stripe */
+
+	if (min_p == sp2d->pages_in_unit) {
+		/* Didn't do it yet */
+		min_p = _sp2d_min_pg(sp2d);
+		max_p = _sp2d_max_pg(sp2d);
+	}
+
+	while (offset < last_stripe_end) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if ((min_p <= p) && (p <= max_p)) {
+			struct page *page;
+			bool uptodate;
+
+			BUG_ON(_1ps->pages[c]);
+			page = ios->r4w->get_page(ios->private, offset,
+						  &uptodate);
+			if (unlikely(!page))
+				return -ENOMEM;
+
+			_1ps->pages[c] = page;
+			/* Mark read-pages to be cache_released */
+			_1ps->page_is_read[c] = true;
+			if (!uptodate)
+				_add_to_read_4_write(ios, &read_si, page);
+		}
+
+		offset += PAGE_SIZE;
+		if (p == (sp2d->pages_in_unit - 1)) {
+			++c;
+			p = 0;
+			ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+		} else {
+			read_si.obj_offset += PAGE_SIZE;
+			++p;
+		}
+	}
+
+read_it:
+	ios_read = ios->ios_read_4_write;
+	if (!ios_read)
+		return 0;
+
+	/* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
+	 * to check for per_dev->bio
+	 */
+	ios_read->pages = ios->pages;
+
+	/* Now read these devices */
+	for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+		ret = _ore_read_mirror(ios_read, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = ore_io_execute(ios_read); /* Synchronus execution */
+	if (unlikely(ret)) {
+		ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+		return ret;
+	}
+
+	_mark_read4write_pages_uptodate(ios_read, ret);
+	return 0;
+}
+
+/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
+int _ore_add_parity_unit(struct ore_io_state *ios,
+			    struct ore_striping_info *si,
+			    struct ore_per_dev_state *per_dev,
+			    unsigned cur_len)
+{
+	if (ios->reading) {
+		BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
+		_ore_add_sg_seg(per_dev, cur_len, true);
+	} else {
+		struct __stripe_pages_2d *sp2d = ios->sp2d;
+		struct page **pages = ios->parity_pages + ios->cur_par_page;
+		unsigned num_pages;
+		unsigned array_start = 0;
+		unsigned i;
+		int ret;
+
+		si->cur_pg = _sp2d_min_pg(sp2d);
+		num_pages  = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+		if (!cur_len) /* If last stripe operate on parity comp */
+			si->cur_comp = sp2d->data_devs;
+
+		if (!per_dev->length) {
+			per_dev->offset += si->cur_pg * PAGE_SIZE;
+			/* If first stripe, Read in all read4write pages
+			 * (if needed) before we calculate the first parity.
+			 */
+			_read_4_write(ios);
+		}
+
+		for (i = 0; i < num_pages; i++) {
+			pages[i] = _raid_page_alloc();
+			if (unlikely(!pages[i]))
+				return -ENOMEM;
+
+			++(ios->cur_par_page);
+		}
+
+		BUG_ON(si->cur_comp != sp2d->data_devs);
+		BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
+
+		ret = _ore_add_stripe_unit(ios,  &array_start, 0, pages,
+					   per_dev, num_pages * PAGE_SIZE);
+		if (unlikely(ret))
+			return ret;
+
+		/* TODO: raid6 if (last_parity_dev) */
+		_gen_xor_unit(sp2d);
+		_sp2d_reset(sp2d, ios->r4w, ios->private);
+	}
+	return 0;
+}
+
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
+{
+	struct ore_layout *layout = ios->layout;
+
+	if (ios->parity_pages) {
+		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+		unsigned stripe_size = ios->si.bytes_in_stripe;
+		u64 last_stripe, first_stripe;
+
+		if (_sp2d_alloc(pages_in_unit, layout->group_width,
+				layout->parity, &ios->sp2d)) {
+			return -ENOMEM;
+		}
+
+		BUG_ON(ios->offset % PAGE_SIZE);
+
+		/* Round io down to last full strip */
+		first_stripe = div_u64(ios->offset, stripe_size);
+		last_stripe = div_u64(ios->offset + ios->length, stripe_size);
+
+		/* If an IO spans more then a single stripe it must end at
+		 * a stripe boundary. The reminder at the end is pushed into the
+		 * next IO.
+		 */
+		if (last_stripe != first_stripe) {
+			ios->length = last_stripe * stripe_size - ios->offset;
+
+			BUG_ON(!ios->length);
+			ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
+					PAGE_SIZE;
+			ios->si.length = ios->length; /*make it consistent */
+		}
+	}
+	return 0;
+}
+
+void _ore_free_raid_stuff(struct ore_io_state *ios)
+{
+	if (ios->sp2d) { /* writing and raid */
+		unsigned i;
+
+		for (i = 0; i < ios->cur_par_page; i++) {
+			struct page *page = ios->parity_pages[i];
+
+			if (page)
+				_raid_page_free(page);
+		}
+		if (ios->extra_part_alloc)
+			kfree(ios->parity_pages);
+		/* If IO returned an error pages might need unlocking */
+		_sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+		_sp2d_free(ios->sp2d);
+	} else {
+		/* Will only be set if raid reading && sglist is big */
+		if (ios->extra_part_alloc)
+			kfree(ios->per_dev[0].sglist);
+	}
+	if (ios->ios_read_4_write)
+		ore_put_io_state(ios->ios_read_4_write);
+}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 00000000000..2ffd2c3c6e4
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) from 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <scsi/osd_ore.h>
+
+#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define ORE_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define ORE_DBGMSG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+#define ORE_DBGMSG2(M...) do {} while (0)
+/* #define ORE_DBGMSG2 ORE_DBGMSG */
+
+/* Calculate the component order in a stripe. eg the logical data unit
+ * address within the stripe of @dev given the @par_dev of this stripe.
+ */
+static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
+				  unsigned par_dev, unsigned dev)
+{
+	unsigned first_dev = dev - dev % devs_in_group;
+
+	dev -= first_dev;
+	par_dev -= first_dev;
+
+	if (devs_in_group == par_dev) /* The raid 0 case */
+		return dev / mirrors_p1;
+	/* raid4/5/6 case */
+	return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
+	       mirrors_p1;
+}
+
+/* ios_raid.c stuff needed by ios.c */
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
+void _ore_free_raid_stuff(struct ore_io_state *ios);
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		 bool not_last);
+int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
+		     struct ore_per_dev_state *per_dev, unsigned cur_len);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+		       struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+				struct ore_striping_info *si, struct page *page)
+{
+	if (!sp2d) /* Inline the fast path */
+		return; /* Hay no raid stuff */
+	_ore_add_stripe_page(sp2d, si, page);
+}
+
+/* ios.c stuff needed by ios_raid.c */
+int  _ore_get_io_state(struct ore_layout *layout,
+			struct ore_components *oc, unsigned numdevs,
+			unsigned sgs_per_dev, unsigned num_par_pages,
+			struct ore_io_state **pios);
+int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct page **pages,
+		struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 274894053b0..057b237b8b6 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
 	struct ore_io_state *ios;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		return ret;
@@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
 	struct ore_io_state *ios;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		return ret;
@@ -355,12 +355,12 @@ static const struct export_operations exofs_export_ops;
 /*
  * Write the superblock to the OSD
  */
-int exofs_sync_fs(struct super_block *sb, int wait)
+static int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
 	struct ore_comp one_comp;
-	struct ore_components comps;
+	struct ore_components oc;
 	struct ore_io_state *ios;
 	int ret = -ENOMEM;
 
@@ -378,9 +378,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	 * the writeable info is set in exofs_sbi_write_stats() above.
 	 */
 
-	exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
+	exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
 
-	ret = ore_get_io_state(&sbi->layout, &comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oc, &ios);
 	if (unlikely(ret))
 		goto out;
 
@@ -429,19 +429,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
 		msg, dev_path ?: "", odi->osdname, _LLU(pid));
 }
 
-void exofs_free_sbi(struct exofs_sb_info *sbi)
+static void exofs_free_sbi(struct exofs_sb_info *sbi)
 {
-	while (sbi->comps.numdevs) {
-		int i = --sbi->comps.numdevs;
-		struct osd_dev *od = sbi->comps.ods[i];
+	unsigned numdevs = sbi->oc.numdevs;
+
+	while (numdevs) {
+		unsigned i = --numdevs;
+		struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
 
 		if (od) {
-			sbi->comps.ods[i] = NULL;
+			ore_comp_set_dev(&sbi->oc, i, NULL);
 			osduld_put_device(od);
 		}
 	}
-	if (sbi->comps.ods != sbi->_min_one_dev)
-		kfree(sbi->comps.ods);
+	kfree(sbi->oc.ods);
 	kfree(sbi);
 }
 
@@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb)
 				  msecs_to_jiffies(100));
 	}
 
-	_exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
+	_exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
 
 	bdi_destroy(&sbi->bdi);
@@ -479,76 +480,20 @@ static void exofs_put_super(struct super_block *sb)
 static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 				    struct exofs_device_table *dt)
 {
-	u64 stripe_length;
+	int ret;
 
-	sbi->data_map.odm_num_comps   =
-				le32_to_cpu(dt->dt_data_map.cb_num_comps);
-	sbi->data_map.odm_stripe_unit =
+	sbi->layout.stripe_unit =
 				le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
-	sbi->data_map.odm_group_width =
+	sbi->layout.group_width =
 				le32_to_cpu(dt->dt_data_map.cb_group_width);
-	sbi->data_map.odm_group_depth =
+	sbi->layout.group_depth =
 				le32_to_cpu(dt->dt_data_map.cb_group_depth);
-	sbi->data_map.odm_mirror_cnt  =
-				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
-	sbi->data_map.odm_raid_algorithm  =
+	sbi->layout.mirrors_p1  =
+				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+	sbi->layout.raid_algorithm  =
 				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 
-/* FIXME: Only raid0 for now. if not so, do not mount */
-	if (sbi->data_map.odm_num_comps != numdevs) {
-		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
-			  sbi->data_map.odm_num_comps, numdevs);
-		return -EINVAL;
-	}
-	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
-		EXOFS_ERR("Only RAID_0 for now\n");
-		return -EINVAL;
-	}
-	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
-		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
-			  numdevs, sbi->data_map.odm_mirror_cnt);
-		return -EINVAL;
-	}
-
-	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
-		EXOFS_ERR("Stripe Unit(0x%llx)"
-			  " must be Multples of PAGE_SIZE(0x%lx)\n",
-			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
-		return -EINVAL;
-	}
-
-	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
-	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
-
-	if (sbi->data_map.odm_group_width) {
-		sbi->layout.group_width = sbi->data_map.odm_group_width;
-		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
-		if (!sbi->layout.group_depth) {
-			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
-			return -EINVAL;
-		}
-		sbi->layout.group_count = sbi->data_map.odm_num_comps /
-						sbi->layout.mirrors_p1 /
-						sbi->data_map.odm_group_width;
-	} else {
-		if (sbi->data_map.odm_group_depth) {
-			printk(KERN_NOTICE "Warning: group_depth ignored "
-				"group_width == 0 && group_depth == %d\n",
-				sbi->data_map.odm_group_depth);
-			sbi->data_map.odm_group_depth = 0;
-		}
-		sbi->layout.group_width = sbi->data_map.odm_num_comps /
-							sbi->layout.mirrors_p1;
-		sbi->layout.group_depth = -1;
-		sbi->layout.group_count = 1;
-	}
-
-	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
-	if (stripe_length >= (1ULL << 32)) {
-		EXOFS_ERR("Total Stripe length(0x%llx)"
-			  " >= 32bit is not supported\n", _LLU(stripe_length));
-		return -EINVAL;
-	}
+	ret = ore_verify_layout(numdevs, &sbi->layout);
 
 	EXOFS_DBGMSG("exofs: layout: "
 		"num_comps=%u stripe_unit=0x%x group_width=%u "
@@ -558,8 +503,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 		sbi->layout.group_width,
 		_LLU(sbi->layout.group_depth),
 		sbi->layout.mirrors_p1,
-		sbi->data_map.odm_raid_algorithm);
-	return 0;
+		sbi->layout.raid_algorithm);
+	return ret;
 }
 
 static unsigned __ra_pages(struct ore_layout *layout)
@@ -605,12 +550,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
 	return !(odi->systemid_len || odi->osdname_len);
 }
 
+int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+		      struct exofs_dev **peds)
+{
+	struct __alloc_ore_devs_and_exofs_devs {
+		/* Twice bigger table: See exofs_init_comps() and comment at
+		 * exofs_read_lookup_dev_table()
+		 */
+		struct ore_dev *oreds[numdevs * 2 - 1];
+		struct exofs_dev eds[numdevs];
+	} *aoded;
+	struct exofs_dev *eds;
+	unsigned i;
+
+	aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
+	if (unlikely(!aoded)) {
+		EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+			  numdevs);
+		return -ENOMEM;
+	}
+
+	sbi->oc.ods = aoded->oreds;
+	*peds = eds = aoded->eds;
+	for (i = 0; i < numdevs; ++i)
+		aoded->oreds[i] = &eds[i].ored;
+	return 0;
+}
+
 static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 				       struct osd_dev *fscb_od,
 				       unsigned table_count)
 {
 	struct ore_comp comp;
 	struct exofs_device_table *dt;
+	struct exofs_dev *eds;
 	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
 					     sizeof(*dt);
 	unsigned numdevs, i;
@@ -623,7 +596,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 		return -ENOMEM;
 	}
 
-	sbi->comps.numdevs = 0;
+	sbi->oc.numdevs = 0;
 
 	comp.obj.partition = sbi->one_comp.obj.partition;
 	comp.obj.id = EXOFS_DEVTABLE_ID;
@@ -647,20 +620,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 	if (unlikely(ret))
 		goto out;
 
-	if (likely(numdevs > 1)) {
-		unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
-
-		/* Twice bigger table: See exofs_init_comps() and below
-		 * comment
-		 */
-		sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
-		if (unlikely(!sbi->comps.ods)) {
-			EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
-				  numdevs);
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
+	ret = __alloc_dev_table(sbi, numdevs, &eds);
+	if (unlikely(ret))
+		goto out;
+	/* exofs round-robins the device table view according to inode
+	 * number. We hold a: twice bigger table hence inodes can point
+	 * to any device and have a sequential view of the table
+	 * starting at this device. See exofs_init_comps()
+	 */
+	memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
+		(numdevs - 1) * sizeof(sbi->oc.ods[0]));
 
 	for (i = 0; i < numdevs; i++) {
 		struct exofs_fscb fscb;
@@ -676,13 +645,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
 		       i, odi.osdname);
 
+		/* the exofs id is currently the table index */
+		eds[i].did = i;
+
 		/* On all devices the device table is identical. The user can
 		 * specify any one of the participating devices on the command
 		 * line. We always keep them in device-table order.
 		 */
 		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
-			sbi->comps.ods[i] = fscb_od;
-			++sbi->comps.numdevs;
+			eds[i].ored.od = fscb_od;
+			++sbi->oc.numdevs;
 			fscb_od = NULL;
 			continue;
 		}
@@ -695,8 +667,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 			goto out;
 		}
 
-		sbi->comps.ods[i] = od;
-		++sbi->comps.numdevs;
+		eds[i].ored.od = od;
+		++sbi->oc.numdevs;
 
 		/* Read the fscb of the other devices to make sure the FS
 		 * partition is there.
@@ -718,21 +690,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 
 out:
 	kfree(dt);
-	if (likely(!ret)) {
-		unsigned numdevs = sbi->comps.numdevs;
-
-		if (unlikely(fscb_od)) {
+	if (unlikely(fscb_od && !ret)) {
 			EXOFS_ERR("ERROR: Bad device-table container device not present\n");
 			osduld_put_device(fscb_od);
 			return -EINVAL;
-		}
-		/* exofs round-robins the device table view according to inode
-		 * number. We hold a: twice bigger table hence inodes can point
-		 * to any device and have a sequential view of the table
-		 * starting at this device. See exofs_init_comps()
-		 */
-		for (i = 0; i < numdevs - 1; ++i)
-			sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
 	}
 	return ret;
 }
@@ -783,10 +744,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->one_comp.obj.partition = opts->pid;
 	sbi->one_comp.obj.id = 0;
 	exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
-	sbi->comps.numdevs = 1;
-	sbi->comps.single_comp = EC_SINGLE_COMP;
-	sbi->comps.comps = &sbi->one_comp;
-	sbi->comps.ods = sbi->_min_one_dev;
+	sbi->oc.numdevs = 1;
+	sbi->oc.single_comp = EC_SINGLE_COMP;
+	sbi->oc.comps = &sbi->one_comp;
 
 	/* fill in some other data by hand */
 	memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -835,7 +795,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		if (unlikely(ret))
 			goto free_sbi;
 	} else {
-		sbi->comps.ods[0] = od;
+		struct exofs_dev *eds;
+
+		ret = __alloc_dev_table(sbi, 1, &eds);
+		if (unlikely(ret))
+			goto free_sbi;
+
+		ore_comp_set_dev(&sbi->oc, 0, od);
 	}
 
 	__sbi_read_stats(sbi);
@@ -875,7 +841,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	_exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
+	_exofs_print_device("Mounting", opts->dev_name,
+			    ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
 	return 0;
 
@@ -924,7 +891,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	uint64_t used = ULLONG_MAX;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
 	if (ret) {
 		EXOFS_DBGMSG("ore_get_io_state failed.\n");
 		return ret;
@@ -981,7 +948,7 @@ static const struct super_operations exofs_sops = {
  * EXPORT OPERATIONS
  *****************************************************************************/
 
-struct dentry *exofs_get_parent(struct dentry *child)
+static struct dentry *exofs_get_parent(struct dentry *child)
 {
 	unsigned long ino = exofs_parent_ino(child);
 
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 5d979b4347b..c922adc8ef4 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-int
-ext2_init_security(struct inode *inode, struct inode *dir,
-		   const struct qstr *qstr)
+int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		    void *fs_info)
 {
-	int err;
-	size_t len;
-	void *value;
-	char *name;
+	const struct xattr *xattr;
+	int err = 0;
 
-	err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
+				     xattr->name, xattr->value,
+				     xattr->value_len, 0);
+		if (err < 0)
+			break;
 	}
-	err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
-			     name, value, len, 0);
-	kfree(name);
-	kfree(value);
 	return err;
 }
 
+int
+ext2_init_security(struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &ext2_initxattrs, NULL);
+}
+
 const struct xattr_handler ext2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext2_xattr_security_list,
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index b8d9f83aa5c..3c218b8a51d 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
-		   const struct qstr *qstr)
+int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		    void *fs_info)
 {
-	int err;
-	size_t len;
-	void *value;
-	char *name;
+	const struct xattr *xattr;
+	handle_t *handle = fs_info;
+	int err = 0;
 
-	err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ext3_xattr_set_handle(handle, inode,
+					    EXT3_XATTR_INDEX_SECURITY,
+					    xattr->name, xattr->value,
+					    xattr->value_len, 0);
+		if (err < 0)
+			break;
 	}
-	err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
-				    name, value, len, 0);
-	kfree(name);
-	kfree(value);
 	return err;
 }
 
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &ext3_initxattrs, handle);
+}
+
 const struct xattr_handler ext3_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext3_xattr_security_list,
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 007c3bfbf09..34e4350dd4d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-int
-ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
-		   const struct qstr *qstr)
+int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		    void *fs_info)
 {
-	int err;
-	size_t len;
-	void *value;
-	char *name;
+	const struct xattr *xattr;
+	handle_t *handle = fs_info;
+	int err = 0;
 
-	err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ext4_xattr_set_handle(handle, inode,
+					    EXT4_XATTR_INDEX_SECURITY,
+					    xattr->name, xattr->value,
+					    xattr->value_len, 0);
+		if (err < 0)
+			break;
 	}
-	err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
-				    name, value, len, 0);
-	kfree(name);
-	kfree(value);
 	return err;
 }
 
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &ext4_initxattrs, handle);
+}
+
 const struct xattr_handler ext4_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext4_xattr_security_list,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 900cf986aad..6525b804d5e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -624,31 +624,29 @@ fail:
 	return error;
 }
 
-static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
-			      const struct qstr *qstr)
+int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		    void *fs_info)
 {
-	int err;
-	size_t len;
-	void *value;
-	char *name;
-
-	err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
-					   &name, &value, &len);
-
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = __gfs2_xattr_set(inode, xattr->name, xattr->value,
+				       xattr->value_len, 0,
+				       GFS2_EATYPE_SECURITY);
+		if (err < 0)
+			break;
 	}
-
-	err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
-			       GFS2_EATYPE_SECURITY);
-	kfree(value);
-	kfree(name);
-
 	return err;
 }
 
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
+			      const struct qstr *qstr)
+{
+	return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
+					    &gfs2_initxattrs, NULL);
+}
+
 /**
  * gfs2_create_inode - Create a new inode
  * @dir: The parent directory
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index cfeb7164b08..0f20208df60 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -22,26 +22,29 @@
 #include <linux/security.h>
 #include "nodelist.h"
 
-/* ---- Initial Security Label Attachment -------------- */
-int jffs2_init_security(struct inode *inode, struct inode *dir,
-			const struct qstr *qstr)
+/* ---- Initial Security Label(s) Attachment callback --- */
+int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		     void *fs_info)
 {
-	int rc;
-	size_t len;
-	void *value;
-	char *name;
+	const struct xattr *xattr;
+	int err = 0;
 
-	rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
-	if (rc) {
-		if (rc == -EOPNOTSUPP)
-			return 0;
-		return rc;
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
+					xattr->name, xattr->value,
+					xattr->value_len, 0);
+		if (err < 0)
+			break;
 	}
-	rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+	return err;
+}
 
-	kfree(name);
-	kfree(value);
-	return rc;
+/* ---- Initial Security Label(s) Attachment ----------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir,
+			const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &jffs2_initxattrs, NULL);
 }
 
 /* ---- XATTR Handler for "security.*" ----------------- */
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index e87fedef23d..26683e15b3a 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -1089,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 #ifdef CONFIG_JFS_SECURITY
-int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
-		      const struct qstr *qstr)
+int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		   void *fs_info)
 {
-	int rc;
-	size_t len;
-	void *value;
-	char *suffix;
+	const struct xattr *xattr;
+	tid_t *tid = fs_info;
 	char *name;
-
-	rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
-					  &len);
-	if (rc) {
-		if (rc == -EOPNOTSUPP)
-			return 0;
-		return rc;
-	}
-	name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
-		       GFP_NOFS);
-	if (!name) {
-		rc = -ENOMEM;
-		goto kmalloc_failed;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+			       strlen(xattr->name) + 1, GFP_NOFS);
+		if (!name) {
+			err = -ENOMEM;
+			break;
+		}
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+		err = __jfs_setxattr(*tid, inode, name,
+				     xattr->value, xattr->value_len, 0);
+		kfree(name);
+		if (err < 0)
+			break;
 	}
-	strcpy(name, XATTR_SECURITY_PREFIX);
-	strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-
-	rc = __jfs_setxattr(tid, inode, name, value, len, 0);
-
-	kfree(name);
-kmalloc_failed:
-	kfree(suffix);
-	kfree(value);
+	return err;
+}
 
-	return rc;
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+		      const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &jfs_initxattrs, &tid);
 }
 #endif
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index b7c99bfb3da..6f29836ec0c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 	struct hlist_node *pos;
 	struct nlm_host	*host = NULL;
 	struct nsm_handle *nsm = NULL;
-	struct sockaddr_in sin = {
-		.sin_family	= AF_INET,
-	};
-	struct sockaddr_in6 sin6 = {
-		.sin6_family	= AF_INET6,
-	};
-	struct sockaddr *src_sap;
-	size_t src_len = rqstp->rq_addrlen;
+	struct sockaddr *src_sap = svc_daddr(rqstp);
+	size_t src_len = rqstp->rq_daddrlen;
 	struct nlm_lookup_host_info ni = {
 		.server		= 1,
 		.sap		= svc_addr(rqstp),
@@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 
 	mutex_lock(&nlm_host_mutex);
 
-	switch (ni.sap->sa_family) {
-	case AF_INET:
-		sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
-		src_sap = (struct sockaddr *)&sin;
-		break;
-	case AF_INET6:
-		ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
-		src_sap = (struct sockaddr *)&sin6;
-		break;
-	default:
-		dprintk("lockd: %s failed; unrecognized address family\n",
-			__func__);
-		goto out;
-	}
-
 	if (time_after_eq(jiffies, next_gc))
 		nlm_gc_hosts();
 
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index abfff9d7979..c061b9aa7dd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -282,7 +282,7 @@ int lockd_up(void)
 	/*
 	 * Create the kernel thread and wait for it to start.
 	 */
-	nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+	nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
 	if (IS_ERR(nlmsvc_rqst)) {
 		error = PTR_ERR(nlmsvc_rqst);
 		nlmsvc_rqst = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index 703f545097d..3b0d05dcd7c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -60,7 +60,7 @@
  *
  *  Initial implementation of mandatory locks. SunOS turned out to be
  *  a rotten model, so I implemented the "obvious" semantics.
- *  See 'Documentation/mandatory.txt' for details.
+ *  See 'Documentation/filesystems/mandatory-locking.txt' for details.
  *  Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
  *
  *  Don't allow mandatory locks on mmap()'ed files. Added simple functions to
@@ -133,6 +133,20 @@
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
 #define IS_LEASE(fl)	(fl->fl_flags & FL_LEASE)
 
+static bool lease_breaking(struct file_lock *fl)
+{
+	return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+}
+
+static int target_leasetype(struct file_lock *fl)
+{
+	if (fl->fl_flags & FL_UNLOCK_PENDING)
+		return F_UNLCK;
+	if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+		return F_RDLCK;
+	return fl->fl_type;
+}
+
 int leases_enable = 1;
 int lease_break_time = 45;
 
@@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 
 EXPORT_SYMBOL(locks_mandatory_area);
 
+static void lease_clear_pending(struct file_lock *fl, int arg)
+{
+	switch (arg) {
+	case F_UNLCK:
+		fl->fl_flags &= ~FL_UNLOCK_PENDING;
+		/* fall through: */
+	case F_RDLCK:
+		fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+	}
+}
+
 /* We already had a lease on this file; just change its type */
 int lease_modify(struct file_lock **before, int arg)
 {
@@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg)
 
 	if (error)
 		return error;
+	lease_clear_pending(fl, arg);
 	locks_wake_up_blocks(fl);
 	if (arg == F_UNLCK)
 		locks_delete_lock(before);
@@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg)
 
 EXPORT_SYMBOL(lease_modify);
 
+static bool past_time(unsigned long then)
+{
+	if (!then)
+		/* 0 is a special value meaning "this never expires": */
+		return false;
+	return time_after(jiffies, then);
+}
+
 static void time_out_leases(struct inode *inode)
 {
 	struct file_lock **before;
 	struct file_lock *fl;
 
 	before = &inode->i_flock;
-	while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) {
-		if ((fl->fl_break_time == 0)
-				|| time_before(jiffies, fl->fl_break_time)) {
-			before = &fl->fl_next;
-			continue;
-		}
-		lease_modify(before, fl->fl_type & ~F_INPROGRESS);
+	while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
+		if (past_time(fl->fl_downgrade_time))
+			lease_modify(before, F_RDLCK);
+		if (past_time(fl->fl_break_time))
+			lease_modify(before, F_UNLCK);
 		if (fl == *before)	/* lease_modify may have freed fl */
 			before = &fl->fl_next;
 	}
@@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode)
  */
 int __break_lease(struct inode *inode, unsigned int mode)
 {
-	int error = 0, future;
+	int error = 0;
 	struct file_lock *new_fl, *flock;
 	struct file_lock *fl;
 	unsigned long break_time;
@@ -1182,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	if ((flock == NULL) || !IS_LEASE(flock))
 		goto out;
 
+	if (!locks_conflict(flock, new_fl))
+		goto out;
+
 	for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next)
 		if (fl->fl_owner == current->files)
 			i_have_this_lease = 1;
 
-	if (want_write) {
-		/* If we want write access, we have to revoke any lease. */
-		future = F_UNLCK | F_INPROGRESS;
-	} else if (flock->fl_type & F_INPROGRESS) {
-		/* If the lease is already being broken, we just leave it */
-		future = flock->fl_type;
-	} else if (flock->fl_type & F_WRLCK) {
-		/* Downgrade the exclusive lease to a read-only lease. */
-		future = F_RDLCK | F_INPROGRESS;
-	} else {
-		/* the existing lease was read-only, so we can read too. */
-		goto out;
-	}
-
 	if (IS_ERR(new_fl) && !i_have_this_lease
 			&& ((mode & O_NONBLOCK) == 0)) {
 		error = PTR_ERR(new_fl);
@@ -1214,12 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	}
 
 	for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
-		if (fl->fl_type != future) {
-			fl->fl_type = future;
+		if (want_write) {
+			if (fl->fl_flags & FL_UNLOCK_PENDING)
+				continue;
+			fl->fl_flags |= FL_UNLOCK_PENDING;
 			fl->fl_break_time = break_time;
-			/* lease must have lmops break callback */
-			fl->fl_lmops->lm_break(fl);
+		} else {
+			if (lease_breaking(flock))
+				continue;
+			fl->fl_flags |= FL_DOWNGRADE_PENDING;
+			fl->fl_downgrade_time = break_time;
 		}
+		fl->fl_lmops->lm_break(fl);
 	}
 
 	if (i_have_this_lease || (mode & O_NONBLOCK)) {
@@ -1243,10 +1270,13 @@ restart:
 	if (error >= 0) {
 		if (error == 0)
 			time_out_leases(inode);
-		/* Wait for the next lease that has not been broken yet */
+		/*
+		 * Wait for the next conflicting lease that has not been
+		 * broken yet
+		 */
 		for (flock = inode->i_flock; flock && IS_LEASE(flock);
 				flock = flock->fl_next) {
-			if (flock->fl_type & F_INPROGRESS)
+			if (locks_conflict(new_fl, flock))
 				goto restart;
 		}
 		error = 0;
@@ -1314,7 +1344,7 @@ int fcntl_getlease(struct file *filp)
 	for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
 		if (fl->fl_file == filp) {
-			type = fl->fl_type & ~F_INPROGRESS;
+			type = target_leasetype(fl);
 			break;
 		}
 	}
@@ -1322,50 +1352,23 @@ int fcntl_getlease(struct file *filp)
 	return type;
 }
 
-/**
- *	generic_setlease	-	sets a lease on an open file
- *	@filp: file pointer
- *	@arg: type of lease to obtain
- *	@flp: input - file_lock to use, output - file_lock inserted
- *
- *	The (input) flp->fl_lmops->lm_break function is required
- *	by break_lease().
- *
- *	Called with file_lock_lock held.
- */
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	int error, rdlease_count = 0, wrlease_count = 0;
+	int error;
 
 	lease = *flp;
 
-	error = -EACCES;
-	if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
-		goto out;
-	error = -EINVAL;
-	if (!S_ISREG(inode->i_mode))
+	error = -EAGAIN;
+	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 		goto out;
-	error = security_file_lock(filp, arg);
-	if (error)
+	if ((arg == F_WRLCK)
+	    && ((dentry->d_count > 1)
+		|| (atomic_read(&inode->i_count) > 1)))
 		goto out;
 
-	time_out_leases(inode);
-
-	BUG_ON(!(*flp)->fl_lmops->lm_break);
-
-	if (arg != F_UNLCK) {
-		error = -EAGAIN;
-		if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
-			goto out;
-		if ((arg == F_WRLCK)
-		    && ((dentry->d_count > 1)
-			|| (atomic_read(&inode->i_count) > 1)))
-			goto out;
-	}
-
 	/*
 	 * At this point, we know that if there is an exclusive
 	 * lease on this file, then we hold it on this filp
@@ -1374,27 +1377,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	 * then the file is not open by anyone (including us)
 	 * except for this filp.
 	 */
+	error = -EAGAIN;
 	for (before = &inode->i_flock;
 			((fl = *before) != NULL) && IS_LEASE(fl);
 			before = &fl->fl_next) {
-		if (fl->fl_file == filp)
+		if (fl->fl_file == filp) {
 			my_before = before;
-		else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
-			/*
-			 * Someone is in the process of opening this
-			 * file for writing so we may not take an
-			 * exclusive lease on it.
-			 */
-			wrlease_count++;
-		else
-			rdlease_count++;
+			continue;
+		}
+		/*
+		 * No exclusive leases if someone else has a lease on
+		 * this file:
+		 */
+		if (arg == F_WRLCK)
+			goto out;
+		/*
+		 * Modifying our existing lease is OK, but no getting a
+		 * new lease if someone else is opening for write:
+		 */
+		if (fl->fl_flags & FL_UNLOCK_PENDING)
+			goto out;
 	}
 
-	error = -EAGAIN;
-	if ((arg == F_RDLCK && (wrlease_count > 0)) ||
-	    (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0)))
-		goto out;
-
 	if (my_before != NULL) {
 		error = lease->fl_lmops->lm_change(my_before, arg);
 		if (!error)
@@ -1402,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		goto out;
 	}
 
-	if (arg == F_UNLCK)
-		goto out;
-
 	error = -EINVAL;
 	if (!leases_enable)
 		goto out;
@@ -1415,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 out:
 	return error;
 }
+
+int generic_delete_lease(struct file *filp, struct file_lock **flp)
+{
+	struct file_lock *fl, **before;
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+
+	for (before = &inode->i_flock;
+			((fl = *before) != NULL) && IS_LEASE(fl);
+			before = &fl->fl_next) {
+		if (fl->fl_file != filp)
+			continue;
+		return (*flp)->fl_lmops->lm_change(before, F_UNLCK);
+	}
+	return -EAGAIN;
+}
+
+/**
+ *	generic_setlease	-	sets a lease on an open file
+ *	@filp: file pointer
+ *	@arg: type of lease to obtain
+ *	@flp: input - file_lock to use, output - file_lock inserted
+ *
+ *	The (input) flp->fl_lmops->lm_break function is required
+ *	by break_lease().
+ *
+ *	Called with file_lock_lock held.
+ */
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
+		return -EACCES;
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+	error = security_file_lock(filp, arg);
+	if (error)
+		return error;
+
+	time_out_leases(inode);
+
+	BUG_ON(!(*flp)->fl_lmops->lm_break);
+
+	switch (arg) {
+	case F_UNLCK:
+		return generic_delete_lease(filp, flp);
+	case F_RDLCK:
+	case F_WRLCK:
+		return generic_add_lease(filp, arg, flp);
+	default:
+		BUG();
+	}
+}
 EXPORT_SYMBOL(generic_setlease);
 
 static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
@@ -2126,7 +2183,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 		}
 	} else if (IS_LEASE(fl)) {
 		seq_printf(f, "LEASE  ");
-		if (fl->fl_type & F_INPROGRESS)
+		if (lease_breaking(fl))
 			seq_printf(f, "BREAKING  ");
 		else if (fl->fl_file)
 			seq_printf(f, "ACTIVE    ");
@@ -2142,7 +2199,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 			       : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
 	} else {
 		seq_printf(f, "%s ",
-			       (fl->fl_type & F_INPROGRESS)
+			       (lease_breaking(fl))
 			       ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ "
 			       : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ ");
 	}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9561c8fc8bd..281ae95932c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -176,17 +176,6 @@ retry:
 	return bio;
 }
 
-static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
-{
-	if (lseg->pls_range.iomode == IOMODE_RW) {
-		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-	} else {
-		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-	}
-}
-
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err)
 	if (!uptodate) {
 		if (!rdata->pnfs_error)
 			rdata->pnfs_error = -EIO;
-		bl_set_lo_fail(rdata->lseg);
+		pnfs_set_lo_fail(rdata->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -303,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 						 bl_end_io_read, par);
 			if (IS_ERR(bio)) {
 				rdata->pnfs_error = PTR_ERR(bio);
+				bio = NULL;
 				goto out;
 			}
 		}
@@ -370,7 +360,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	if (!uptodate) {
 		if (!wdata->pnfs_error)
 			wdata->pnfs_error = -EIO;
-		bl_set_lo_fail(wdata->lseg);
+		pnfs_set_lo_fail(wdata->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -386,7 +376,7 @@ static void bl_end_io_write(struct bio *bio, int err)
 	if (!uptodate) {
 		if (!wdata->pnfs_error)
 			wdata->pnfs_error = -EIO;
-		bl_set_lo_fail(wdata->lseg);
+		pnfs_set_lo_fail(wdata->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -543,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 fill_invalid_ext:
 		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
 		for (;npg_zero > 0; npg_zero--) {
+			if (bl_is_sector_init(be->be_inval, isect)) {
+				dprintk("isect %llu already init\n",
+					(unsigned long long)isect);
+				goto next_page;
+			}
 			/* page ref released in bl_end_io_write_zero */
 			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
 			dprintk("%s zero %dth page: index %lu isect %llu\n",
@@ -562,8 +557,7 @@ fill_invalid_ext:
 			 * PageUptodate: It was read before
 			 * sector_initialized: already written out
 			 */
-			if (PageDirty(page) || PageWriteback(page) ||
-			    bl_is_sector_init(be->be_inval, isect)) {
+			if (PageDirty(page) || PageWriteback(page)) {
 				print_page(page);
 				unlock_page(page);
 				page_cache_release(page);
@@ -592,6 +586,7 @@ fill_invalid_ext:
 						 bl_end_io_write_zero, par);
 			if (IS_ERR(bio)) {
 				wdata->pnfs_error = PTR_ERR(bio);
+				bio = NULL;
 				goto out;
 			}
 			/* FIXME: This should be done in bi_end_io */
@@ -640,6 +635,7 @@ next_page:
 					 bl_end_io_write, par);
 		if (IS_ERR(bio)) {
 			wdata->pnfs_error = PTR_ERR(bio);
+			bio = NULL;
 			goto out;
 		}
 		isect += PAGE_CACHE_SECTORS;
@@ -805,7 +801,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 			struct nfs4_deviceid *d_id)
 {
 	struct pnfs_device *dev;
-	struct pnfs_block_dev *rv = NULL;
+	struct pnfs_block_dev *rv;
 	u32 max_resp_sz;
 	int max_pages;
 	struct page **pages = NULL;
@@ -823,18 +819,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 	dev = kmalloc(sizeof(*dev), GFP_NOFS);
 	if (!dev) {
 		dprintk("%s kmalloc failed\n", __func__);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 
 	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
 	if (pages == NULL) {
 		kfree(dev);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 	for (i = 0; i < max_pages; i++) {
 		pages[i] = alloc_page(GFP_NOFS);
-		if (!pages[i])
+		if (!pages[i]) {
+			rv = ERR_PTR(-ENOMEM);
 			goto out_free;
+		}
 	}
 
 	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
@@ -847,8 +845,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
 	rc = nfs4_proc_getdeviceinfo(server, dev);
 	dprintk("%s getdevice info returns %d\n", __func__, rc);
-	if (rc)
+	if (rc) {
+		rv = ERR_PTR(rc);
 		goto out_free;
+	}
 
 	rv = nfs4_blk_decode_device(server, dev);
  out_free:
@@ -866,7 +866,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 	struct pnfs_devicelist *dlist = NULL;
 	struct pnfs_block_dev *bdev;
 	LIST_HEAD(block_disklist);
-	int status = 0, i;
+	int status, i;
 
 	dprintk("%s enter\n", __func__);
 
@@ -898,8 +898,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 		for (i = 0; i < dlist->num_devs; i++) {
 			bdev = nfs4_blk_get_deviceinfo(server, fh,
 						       &dlist->dev_id[i]);
-			if (!bdev) {
-				status = -ENODEV;
+			if (IS_ERR(bdev)) {
+				status = PTR_ERR(bdev);
 				goto out_error;
 			}
 			spin_lock(&b_mt_id->bm_lock);
@@ -960,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 };
 
 static const struct rpc_pipe_ops bl_upcall_ops = {
-	.upcall		= bl_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= bl_pipe_downcall,
 	.destroy_msg	= bl_pipe_destroy_msg,
 };
@@ -989,17 +989,20 @@ static int __init nfs4blocklayout_init(void)
 			      mnt,
 			      NFS_PIPE_DIRNAME, 0, &path);
 	if (ret)
-		goto out_remove;
+		goto out_putrpc;
 
 	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
 				    &bl_upcall_ops, 0);
+	path_put(&path);
 	if (IS_ERR(bl_device_pipe)) {
 		ret = PTR_ERR(bl_device_pipe);
-		goto out_remove;
+		goto out_putrpc;
 	}
 out:
 	return ret;
 
+out_putrpc:
+	rpc_put_mount();
 out_remove:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
 	return ret;
@@ -1012,6 +1015,7 @@ static void __exit nfs4blocklayout_exit(void)
 
 	pnfs_unregister_layoutdriver(&blocklayout_type);
 	rpc_unlink(bl_device_pipe);
+	rpc_put_mount();
 }
 
 MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f27d827960a..42acf7ef599 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
 }
 
 struct bl_dev_msg {
-	int status;
+	int32_t status;
 	uint32_t major, minor;
 };
 
@@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq;
 #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
 
 /* blocklayoutdev.c */
-ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-		       char __user *, size_t);
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
 struct block_device *nfs4_blkdev_get(dev_t dev);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a83b393fb01..d08ba9107fd 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev)
 	return blkdev_put(bdev, FMODE_READ);
 }
 
-/*
- * Shouldn't there be a rpc_generic_upcall() to do this for us?
- */
-ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-		       char __user *dst, size_t buflen)
-{
-	char *data = (char *)msg->data + msg->copied;
-	size_t mlen = min(msg->len - msg->copied, buflen);
-	unsigned long left;
-
-	left = copy_to_user(dst, data, mlen);
-	if (left == mlen) {
-		msg->errno = -EFAULT;
-		return -EFAULT;
-	}
-
-	mlen -= left;
-	msg->copied += mlen;
-	msg->errno = 0;
-	return mlen;
-}
-
 static struct bl_dev_msg bl_mount_reply;
 
 ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
@@ -131,7 +109,7 @@ struct pnfs_block_dev *
 nfs4_blk_decode_device(struct nfs_server *server,
 		       struct pnfs_device *dev)
 {
-	struct pnfs_block_dev *rv = NULL;
+	struct pnfs_block_dev *rv;
 	struct block_device *bd = NULL;
 	struct rpc_pipe_msg msg;
 	struct bl_msg_hdr bl_msg = {
@@ -141,7 +119,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	uint8_t *dataptr;
 	DECLARE_WAITQUEUE(wq, current);
 	struct bl_dev_msg *reply = &bl_mount_reply;
-	int offset, len, i;
+	int offset, len, i, rc;
 
 	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
 	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
@@ -168,8 +146,10 @@ nfs4_blk_decode_device(struct nfs_server *server,
 
 	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
 	add_wait_queue(&bl_wq, &wq);
-	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+	rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
+	if (rc < 0) {
 		remove_wait_queue(&bl_wq, &wq);
+		rv = ERR_PTR(rc);
 		goto out;
 	}
 
@@ -187,8 +167,9 @@ nfs4_blk_decode_device(struct nfs_server *server,
 
 	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
 	if (IS_ERR(bd)) {
-		dprintk("%s failed to open device : %ld\n",
-			__func__, PTR_ERR(bd));
+		rc = PTR_ERR(bd);
+		dprintk("%s failed to open device : %d\n", __func__, rc);
+		rv = ERR_PTR(rc);
 		goto out;
 	}
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e3d29426905..516f3375e06 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv)
 	else
 		goto out_err;
 
-	return svc_prepare_thread(serv, &serv->sv_pools[0]);
+	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
 
 out_err:
 	if (ret == 0)
@@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
 	INIT_LIST_HEAD(&serv->sv_cb_list);
 	spin_lock_init(&serv->sv_cb_lock);
 	init_waitqueue_head(&serv->sv_cb_waitq);
-	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
+	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
 	if (IS_ERR(rqstp)) {
 		svc_xprt_put(serv->sv_bc_xprt);
 		serv->sv_bc_xprt = NULL;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5833fbbf59b..873bf00d51a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
 	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
 	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
 
-	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-	    sin1->sin6_scope_id != sin2->sin6_scope_id)
+	if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
 		return 0;
+	else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		return sin1->sin6_scope_id == sin2->sin6_scope_id;
 
-	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
+	return 1;
 }
 #else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
 static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -1867,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
 	/* display one transport per line on subsequent lines */
 	clp = list_entry(v, struct nfs_client, cl_share_link);
 
+	/* Check if the client is initialized */
+	if (clp->cl_cons_state != NFS_CS_READY)
+		return 0;
+
 	seq_printf(m, "v%u %s %s %3d %s\n",
 		   clp->rpc_ops->version,
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 321a66bc384..7f265406980 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 			sizeof(delegation->stateid.data));
 	delegation->type = res->delegation_type;
 	delegation->maxsize = res->maxsize;
-	delegation->change_attr = nfsi->change_attr;
+	delegation->change_attr = inode->i_version;
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
 	delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 5b1006480bc..7cf2c4699b0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
 	auxdata.ctime = nfsi->vfs_inode.i_ctime;
 
 	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
-		auxdata.change_attr = nfsi->change_attr;
+		auxdata.change_attr = nfsi->vfs_inode.i_version;
 
 	if (bufmax > sizeof(auxdata))
 		bufmax = sizeof(auxdata);
@@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
 	auxdata.ctime = nfsi->vfs_inode.i_ctime;
 
 	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
-		auxdata.change_attr = nfsi->change_attr;
+		auxdata.change_attr = nfsi->vfs_inode.i_version;
 
 	if (memcmp(data, &auxdata, datalen) != 0)
 		return FSCACHE_CHECKAUX_OBSOLETE;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f20801ae0a1..47d1c6ff2d8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -336,8 +336,6 @@ struct idmap {
 	struct idmap_hashtable	idmap_group_hash;
 };
 
-static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-				 char __user *, size_t);
 static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
 				   size_t);
 static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
@@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 static unsigned int fnvhash32(const void *, size_t);
 
 static const struct rpc_pipe_ops idmap_upcall_ops = {
-	.upcall		= idmap_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= idmap_pipe_downcall,
 	.destroy_msg	= idmap_pipe_destroy_msg,
 };
@@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 	return ret;
 }
 
-/* RPC pipefs upcall/downcall routines */
-static ssize_t
-idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-		  char __user *dst, size_t buflen)
-{
-	char *data = (char *)msg->data + msg->copied;
-	size_t mlen = min(msg->len, buflen);
-	unsigned long left;
-
-	left = copy_to_user(dst, data, mlen);
-	if (left == mlen) {
-		msg->errno = -EFAULT;
-		return -EFAULT;
-	}
-
-	mlen -= left;
-	msg->copied += mlen;
-	msg->errno = 0;
-	return mlen;
-}
-
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fe1203797b2..4dc6d078f10 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -318,7 +318,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
 		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
 		memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
-		nfsi->change_attr = 0;
+		inode->i_version = 0;
 		inode->i_size = 0;
 		inode->i_nlink = 0;
 		inode->i_uid = -2;
@@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				| NFS_INO_INVALID_ACCESS
 				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
-			nfsi->change_attr = fattr->change_attr;
+			inode->i_version = fattr->change_attr;
 		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
 			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
 				| NFS_INO_INVALID_DATA;
@@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
 
 	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
 			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
-			&& nfsi->change_attr == fattr->pre_change_attr) {
-		nfsi->change_attr = fattr->change_attr;
+			&& inode->i_version == fattr->pre_change_attr) {
+		inode->i_version = fattr->change_attr;
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 		ret |= NFS_INO_INVALID_ATTR;
@@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 		return -EIO;
 
 	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
-			nfsi->change_attr != fattr->change_attr)
+			inode->i_version != fattr->change_attr)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
@@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
 			(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
-		fattr->pre_change_attr = NFS_I(inode)->change_attr;
+		fattr->pre_change_attr = inode->i_version;
 		fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
@@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	/* More cache consistency checks */
 	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
-		if (nfsi->change_attr != fattr->change_attr) {
+		if (inode->i_version != fattr->change_attr) {
 			dprintk("NFS: change_attr change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			if (S_ISDIR(inode->i_mode))
 				nfs_force_lookup_revalidate(inode);
-			nfsi->change_attr = fattr->change_attr;
+			inode->i_version = fattr->change_attr;
 		}
 	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
 		invalid |= save_cache_validity;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ab12913dd47..c1a1bd8ddf1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -457,13 +457,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
-/*
- * Helper for restarting RPC calls in the possible presence of NFSv4.1
- * sessions.
- */
-static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
-{
-	if (nfs4_has_session(clp))
-		return rpc_restart_call_prepare(task);
-	return rpc_restart_call(task);
-}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3e93e9a1bee..693ae22f873 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,30 +13,6 @@
 
 struct idmap;
 
-/*
- * In a seqid-mutating op, this macro controls which error return
- * values trigger incrementation of the seqid.
- *
- * from rfc 3010:
- * The client MUST monotonically increment the sequence number for the
- * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE
- * operations.  This is true even in the event that the previous
- * operation that used the sequence number received an error.  The only
- * exception to this rule is if the previous operation received one of
- * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID,
- * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR,
- * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE.
- *
- */
-#define seqid_mutating_err(err)       \
-(((err) != NFSERR_STALE_CLIENTID) &&  \
- ((err) != NFSERR_STALE_STATEID)  &&  \
- ((err) != NFSERR_BAD_STATEID)    &&  \
- ((err) != NFSERR_BAD_SEQID)      &&  \
- ((err) != NFSERR_BAD_XDR)        &&  \
- ((err) != NFSERR_RESOURCE)       &&  \
- ((err) != NFSERR_NOFILEHANDLE))
-
 enum nfs4_client_state {
 	NFS4CLNT_MANAGER_RUNNING  = 0,
 	NFS4CLNT_CHECK_LEASE,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e8915d4840a..09119418402 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
-/* For data server errors we don't recover from */
-static void
-filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
-{
-	if (lseg->pls_range.iomode == IOMODE_RW) {
-		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-	} else {
-		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-	}
-}
-
 static int filelayout_async_handle_error(struct rpc_task *task,
 					 struct nfs4_state *state,
 					 struct nfs_client *clp,
@@ -135,7 +122,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 static int filelayout_read_done_cb(struct rpc_task *task,
 				struct nfs_read_data *data)
 {
-	struct nfs_client *clp = data->ds_clp;
 	int reset = 0;
 
 	dprintk("%s DS read\n", __func__);
@@ -145,11 +131,10 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 			nfs4_reset_read(task, data);
-			clp = NFS_SERVER(data->inode)->nfs_client;
 		}
-		nfs_restart_rpc(task, clp);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
@@ -216,17 +201,13 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 
 	if (filelayout_async_handle_error(task, data->args.context->state,
 					  data->ds_clp, &reset) == -EAGAIN) {
-		struct nfs_client *clp;
-
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 			nfs4_reset_write(task, data);
-			clp = NFS_SERVER(data->inode)->nfs_client;
-		} else
-			clp = data->ds_clp;
-		nfs_restart_rpc(task, clp);
+		}
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
@@ -256,9 +237,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
 			prepare_to_resend_writes(data);
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 		} else
-			nfs_restart_rpc(task, data->ds_clp);
+			rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4700fae1ada..d2ae413c986 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -73,9 +73,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
-static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir,
-			     const struct qstr *name, struct nfs_fh *fhandle,
-			     struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
@@ -753,9 +750,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 
 	spin_lock(&dir->i_lock);
 	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
-	if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
+	if (!cinfo->atomic || cinfo->before != dir->i_version)
 		nfs_force_lookup_revalidate(dir);
-	nfsi->change_attr = cinfo->after;
+	dir->i_version = cinfo->after;
 	spin_unlock(&dir->i_lock);
 }
 
@@ -1596,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	int status;
 
 	status = nfs4_run_open_task(data, 0);
-	if (status != 0 || !data->rpc_done)
+	if (!data->rpc_done)
+		return status;
+	if (status != 0) {
+		if (status == -NFS4ERR_BADNAME &&
+				!(o_arg->open_flags & O_CREAT))
+			return -ENOENT;
 		return status;
+	}
 
 	if (o_arg->open_flags & O_CREAT) {
 		update_changeattr(dir, &o_res->cinfo);
@@ -2408,14 +2411,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	return status;
 }
 
-static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
-		const struct nfs_fh *dirfh, const struct qstr *name,
-		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
+		const struct qstr *name, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr)
 {
+	struct nfs_server *server = NFS_SERVER(dir);
 	int		       status;
 	struct nfs4_lookup_arg args = {
 		.bitmask = server->attr_bitmask,
-		.dir_fh = dirfh,
+		.dir_fh = NFS_FH(dir),
 		.name = name,
 	};
 	struct nfs4_lookup_res res = {
@@ -2431,40 +2435,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
 
 	nfs_fattr_init(fattr);
 
-	dprintk("NFS call  lookupfh %s\n", name->name);
-	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
-	dprintk("NFS reply lookupfh: %d\n", status);
-	return status;
-}
-
-static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
-			      struct qstr *name, struct nfs_fh *fhandle,
-			      struct nfs_fattr *fattr)
-{
-	struct nfs4_exception exception = { };
-	int err;
-	do {
-		err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr);
-		/* FIXME: !!!! */
-		if (err == -NFS4ERR_MOVED) {
-			err = -EREMOTE;
-			break;
-		}
-		err = nfs4_handle_exception(server, err, &exception);
-	} while (exception.retry);
-	return err;
-}
-
-static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
-		const struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
-{
-	int status;
-	
 	dprintk("NFS call  lookup %s\n", name->name);
-	status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr);
-	if (status == -NFS4ERR_MOVED)
-		status = nfs4_get_referral(dir, name, fattr, fhandle);
+	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
@@ -2485,11 +2457,20 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
 	struct nfs4_exception exception = { };
 	int err;
 	do {
-		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_lookup(clnt, dir, name, fhandle, fattr),
-				&exception);
-		if (err == -EPERM)
+		int status;
+
+		status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr);
+		switch (status) {
+		case -NFS4ERR_BADNAME:
+			return -ENOENT;
+		case -NFS4ERR_MOVED:
+			err = nfs4_get_referral(dir, name, fattr, fhandle);
+			break;
+		case -NFS4ERR_WRONGSEC:
 			nfs_fixup_secinfo_attributes(fattr, fhandle);
+		}
+		err = nfs4_handle_exception(NFS_SERVER(dir),
+				status, &exception);
 	} while (exception.retry);
 	return err;
 }
@@ -3210,7 +3191,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
-		nfs_restart_rpc(task, server->nfs_client);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
@@ -3260,7 +3241,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
 	struct inode *inode = data->inode;
 	
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
-		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 	if (task->tk_status >= 0) {
@@ -3317,7 +3298,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat
 	struct inode *inode = data->inode;
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 	nfs_refresh_inode(inode, data->res.fattr);
@@ -3857,7 +3838,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 	default:
 		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
 				-EAGAIN) {
-			nfs_restart_rpc(task, data->res.server->nfs_client);
+			rpc_restart_call_prepare(task);
 			return;
 		}
 	}
@@ -4111,8 +4092,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 			break;
 		default:
 			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-				nfs_restart_rpc(task,
-						 calldata->server->nfs_client);
+				rpc_restart_call_prepare(task);
 	}
 }
 
@@ -4945,7 +4925,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 		task->tk_status = 0;
 		/* fall through */
 	case -NFS4ERR_RETRY_UNCACHED_REP:
-		nfs_restart_rpc(task, data->clp);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 	dprintk("<-- %s\n", __func__);
@@ -5786,7 +5766,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 
 	server = NFS_SERVER(lrp->args.inode);
 	if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
-		nfs_restart_rpc(task, lrp->clp);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 	spin_lock(&lo->plh_inode->i_lock);
@@ -5957,7 +5937,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 	}
 
 	if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
-		nfs_restart_rpc(task, server->nfs_client);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 
@@ -6270,7 +6250,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.getroot	= nfs4_proc_get_root,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
-	.lookupfh	= nfs4_proc_lookupfh,
 	.lookup		= nfs4_proc_lookup,
 	.access		= nfs4_proc_access,
 	.readlink	= nfs4_proc_readlink,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e550e8836c3..ee73d9a4f70 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1168,23 +1168,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 /*
  * Called by non rpc-based layout drivers
  */
-int
-pnfs_ld_write_done(struct nfs_write_data *data)
+void pnfs_ld_write_done(struct nfs_write_data *data)
 {
-	int status;
-
-	if (!data->pnfs_error) {
+	if (likely(!data->pnfs_error)) {
 		pnfs_set_layoutcommit(data);
 		data->mds_ops->rpc_call_done(&data->task, data);
-		data->mds_ops->rpc_release(data);
-		return 0;
+	} else {
+		put_lseg(data->lseg);
+		data->lseg = NULL;
+		dprintk("pnfs write error = %d\n", data->pnfs_error);
 	}
-
-	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
-		data->pnfs_error);
-	status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
-				    data->mds_ops, NFS_FILE_SYNC);
-	return status ? : -EAGAIN;
+	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
@@ -1268,23 +1262,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 /*
  * Called by non rpc-based layout drivers
  */
-int
-pnfs_ld_read_done(struct nfs_read_data *data)
+void pnfs_ld_read_done(struct nfs_read_data *data)
 {
-	int status;
-
-	if (!data->pnfs_error) {
+	if (likely(!data->pnfs_error)) {
 		__nfs4_read_done_cb(data);
 		data->mds_ops->rpc_call_done(&data->task, data);
-		data->mds_ops->rpc_release(data);
-		return 0;
+	} else {
+		put_lseg(data->lseg);
+		data->lseg = NULL;
+		dprintk("pnfs write error = %d\n", data->pnfs_error);
 	}
-
-	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
-		data->pnfs_error);
-	status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
-				   data->mds_ops);
-	return status ? : -EAGAIN;
+	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
@@ -1381,6 +1369,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 	}
 }
 
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+	} else {
+		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
+
 void
 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 01cbfd54f3c..1509530cb11 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
 void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
 bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -200,8 +201,8 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
-int pnfs_ld_write_done(struct nfs_write_data *);
-int pnfs_ld_read_done(struct nfs_read_data *);
+void pnfs_ld_write_done(struct nfs_write_data *);
+void pnfs_ld_read_done(struct nfs_read_data *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       struct nfs_open_context *ctx,
 					       loff_t pos,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2171c043ab0..8b48ec63f72 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
-static mempool_t *nfs_rdata_mempool;
-
-#define MIN_POOL_READ	(32)
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
+	struct nfs_read_data *p;
 
+	p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
 	if (p) {
-		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
@@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		else {
 			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
 			if (!p->pagevec) {
-				mempool_free(p, nfs_rdata_mempool);
+				kmem_cache_free(nfs_rdata_cachep, p);
 				p = NULL;
 			}
 		}
@@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
-	mempool_free(p, nfs_rdata_mempool);
+	kmem_cache_free(nfs_rdata_cachep, p);
 }
 
 void nfs_readdata_release(struct nfs_read_data *rdata)
@@ -276,7 +273,6 @@ nfs_async_read_error(struct list_head *head)
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		SetPageError(req->wb_page);
 		nfs_readpage_release(req);
 	}
 }
@@ -322,7 +318,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head
 		offset += len;
 	} while(nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
-	ClearPageError(page);
 	desc->pg_rpc_callops = &nfs_read_partial_ops;
 	return ret;
 out_bad:
@@ -331,7 +326,6 @@ out_bad:
 		list_del(&data->list);
 		nfs_readdata_free(data);
 	}
-	SetPageError(page);
 	nfs_readpage_release(req);
 	return -ENOMEM;
 }
@@ -357,7 +351,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
-		ClearPageError(req->wb_page);
 		*pages++ = req->wb_page;
 	}
 	req = nfs_list_entry(data->pages.next);
@@ -435,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
-	nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
+	rpc_restart_call_prepare(task);
 }
 
 /*
@@ -462,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata)
 	int status = data->task.tk_status;
 
 	if (status < 0)
-		SetPageError(page);
+		set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags);
 
 	if (atomic_dec_and_test(&req->wb_complete)) {
-		if (!PageError(page))
+		if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags))
 			SetPageUptodate(page);
 		nfs_readpage_release(req);
 	}
@@ -541,13 +534,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 static void nfs_readpage_release_full(void *calldata)
 {
 	struct nfs_read_data *data = calldata;
+	struct nfs_pageio_descriptor pgio;
 
+	if (data->pnfs_error) {
+		nfs_pageio_init_read_mds(&pgio, data->inode);
+		pgio.pg_recoalesce = 1;
+	}
 	while (!list_empty(&data->pages)) {
 		struct nfs_page *req = nfs_list_entry(data->pages.next);
 
 		nfs_list_remove_request(req);
-		nfs_readpage_release(req);
+		if (!data->pnfs_error)
+			nfs_readpage_release(req);
+		else
+			nfs_pageio_add_request(&pgio, req);
 	}
+	if (data->pnfs_error)
+		nfs_pageio_complete(&pgio);
 	nfs_readdata_release(calldata);
 }
 
@@ -648,7 +651,6 @@ readpage_async_filler(void *data, struct page *page)
 	return 0;
 out_error:
 	error = PTR_ERR(new);
-	SetPageError(page);
 out_unlock:
 	unlock_page(page);
 	return error;
@@ -711,16 +713,10 @@ int __init nfs_init_readpagecache(void)
 	if (nfs_rdata_cachep == NULL)
 		return -ENOMEM;
 
-	nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
-						     nfs_rdata_cachep);
-	if (nfs_rdata_mempool == NULL)
-		return -ENOMEM;
-
 	return 0;
 }
 
 void nfs_destroy_readpagecache(void)
 {
-	mempool_destroy(nfs_rdata_mempool);
 	kmem_cache_destroy(nfs_rdata_cachep);
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5b19b6aabe1..480b3b6bf71 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 	return 0;
 }
+
+#ifdef CONFIG_NFS_V4
 #ifdef CONFIG_NFS_V4_1
-void show_sessions(struct seq_file *m, struct nfs_server *server)
+static void show_sessions(struct seq_file *m, struct nfs_server *server)
 {
 	if (nfs4_has_session(server->nfs_client))
 		seq_printf(m, ",sessions");
 }
 #else
-void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
 #endif
 
+#ifdef CONFIG_NFS_V4
 #ifdef CONFIG_NFS_V4_1
-void show_pnfs(struct seq_file *m, struct nfs_server *server)
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 {
 	seq_printf(m, ",pnfs=");
 	if (server->pnfs_curr_ld)
@@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server)
 	else
 		seq_printf(m, "not configured");
 }
-#else  /* CONFIG_NFS_V4_1 */
-void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
-#endif /* CONFIG_NFS_V4_1 */
+#else
+static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif
+#endif
 
 static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
 {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index b2fbbde58e4..4f9319a2e56 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 	struct inode *dir = data->dir;
 
 	if (!NFS_PROTO(dir)->unlink_done(task, dir))
-		nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
+		rpc_restart_call_prepare(task);
 }
 
 /**
@@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
 	struct dentry *new_dentry = data->new_dentry;
 
 	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
-		nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
+		rpc_restart_call_prepare(task);
 		return;
 	}
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c9bd2a6b7d4..2219c88d96b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,7 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
 	BUG_ON(error);
 	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
-		nfsi->change_attr++;
+		inode->i_version++;
 	set_bit(PG_MAPPED, &req->wb_flags);
 	SetPagePrivate(req->wb_page);
 	set_page_private(req->wb_page, (unsigned long)req);
@@ -428,7 +428,6 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
-	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -762,6 +761,8 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
+	else
+		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
@@ -1010,7 +1011,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
-		ClearPageError(req->wb_page);
 		*pages++ = req->wb_page;
 	}
 	req = nfs_list_entry(data->pages.next);
@@ -1165,7 +1165,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 static void nfs_writeback_release_full(void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
-	int status = data->task.tk_status;
+	int ret, status = data->task.tk_status;
+	struct nfs_pageio_descriptor pgio;
+
+	if (data->pnfs_error) {
+		nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
+		pgio.pg_recoalesce = 1;
+	}
 
 	/* Update attributes as result of writeback. */
 	while (!list_empty(&data->pages)) {
@@ -1181,6 +1187,11 @@ static void nfs_writeback_release_full(void *calldata)
 			req->wb_bytes,
 			(long long)req_offset(req));
 
+		if (data->pnfs_error) {
+			dprintk(", pnfs error = %d\n", data->pnfs_error);
+			goto next;
+		}
+
 		if (status < 0) {
 			nfs_set_pageerror(page);
 			nfs_context_set_write_error(req->wb_context, status);
@@ -1200,7 +1211,19 @@ remove_request:
 	next:
 		nfs_clear_page_tag_locked(req);
 		nfs_end_page_writeback(page);
+		if (data->pnfs_error) {
+			lock_page(page);
+			nfs_pageio_cond_complete(&pgio, page->index);
+			ret = nfs_page_async_flush(&pgio, page, 0);
+			if (ret) {
+				nfs_set_pageerror(page);
+				dprintk("rewrite to MDS error = %d\n", ret);
+			}
+			unlock_page(page);
+		}
 	}
+	if (data->pnfs_error)
+		nfs_pageio_complete(&pgio);
 	nfs_writedata_release(calldata);
 }
 
@@ -1281,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 				 */
 				argp->stable = NFS_FILE_SYNC;
 			}
-			nfs_restart_rpc(task, server->nfs_client);
+			rpc_restart_call_prepare(task);
 			return;
 		}
 		if (time_before(complain, jiffies)) {
@@ -1553,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 	int flags = FLUSH_SYNC;
 	int ret = 0;
 
+	/* no commits means nothing needs to be done */
+	if (!nfsi->ncommit)
+		return ret;
+
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		/* Don't commit yet if this is a non-blocking flush and there
 		 * are a lot of outstanding writes for this mapping.
@@ -1686,34 +1713,20 @@ out_error:
 int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 		struct page *page)
 {
-	struct nfs_page *req;
-	int ret;
+	/*
+	 * If PagePrivate is set, then the page is currently associated with
+	 * an in-progress read or write request. Don't try to migrate it.
+	 *
+	 * FIXME: we could do this in principle, but we'll need a way to ensure
+	 *        that we can safely release the inode reference while holding
+	 *        the page lock.
+	 */
+	if (PagePrivate(page))
+		return -EBUSY;
 
 	nfs_fscache_release_page(page, GFP_KERNEL);
 
-	req = nfs_find_and_lock_request(page, false);
-	ret = PTR_ERR(req);
-	if (IS_ERR(req))
-		goto out;
-
-	ret = migrate_page(mapping, newpage, page);
-	if (!req)
-		goto out;
-	if (ret)
-		goto out_unlock;
-	page_cache_get(newpage);
-	spin_lock(&mapping->host->i_lock);
-	req->wb_page = newpage;
-	SetPagePrivate(newpage);
-	set_page_private(newpage, (unsigned long)req);
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	spin_unlock(&mapping->host->i_lock);
-	page_cache_release(page);
-out_unlock:
-	nfs_clear_page_tag_locked(req);
-out:
-	return ret;
+	return migrate_page(mapping, newpage, page);
 }
 #endif
 
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f4cc1e2bfc5..62f3b9074e8 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/exportfs.h>
 
-#include <linux/nfsd/syscall.h>
 #include <net/ipv6.h>
 
 #include "nfsd.h"
@@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref)
 	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
 	path_put(&exp->ex_path);
 	auth_domain_put(exp->ex_client);
-	kfree(exp->ex_pathname);
 	nfsd4_fslocs_free(&exp->ex_fslocs);
 	kfree(exp);
 }
@@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_client = dom;
 
-	err = -ENOMEM;
-	exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
-	if (!exp.ex_pathname)
-		goto out2;
-
 	/* expiry */
 	err = -EINVAL;
 	exp.h.expiry_time = get_expiry(&mesg);
@@ -613,8 +606,6 @@ out4:
 	nfsd4_fslocs_free(&exp.ex_fslocs);
 	kfree(exp.ex_uuid);
 out3:
-	kfree(exp.ex_pathname);
-out2:
 	path_put(&exp.ex_path);
 out1:
 	auth_domain_put(dom);
@@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_client = item->ex_client;
 	new->ex_path.dentry = dget(item->ex_path.dentry);
 	new->ex_path.mnt = mntget(item->ex_path.mnt);
-	new->ex_pathname = NULL;
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
@@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_fsid = item->ex_fsid;
 	new->ex_uuid = item->ex_uuid;
 	item->ex_uuid = NULL;
-	new->ex_pathname = item->ex_pathname;
-	item->ex_pathname = NULL;
 	new->ex_fslocs.locations = item->ex_fslocs.locations;
 	item->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
@@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
 	return exp;
 }
 
-static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
+struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
 {
 	u32 fsidv[2];
 
@@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	struct svc_export *exp;
 	__be32 rv;
 
-	exp = find_fsidzero_export(rqstp);
+	exp = rqst_find_fsidzero_export(rqstp);
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
 	rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 02eb4edf0ec..7748d6a18d9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,8 @@
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
 
@@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
 	__be32 *p;
 
 	encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
-	encode_stateid4(xdr, &dp->dl_stateid);
+	encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
 
 	p = xdr_reserve_space(xdr, 4);
 	*p++ = xdr_zero;			/* truncate */
@@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 	 */
 	status = 0;
 out:
+	if (status)
+		nfsd4_mark_cb_fault(cb->cb_clp, status);
 	return status;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
 	warn_no_callback_path(clp, reason);
 }
 
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+	clp->cl_cb_state = NFSD4_CB_FAULT;
+	warn_no_callback_path(clp, reason);
+}
+
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
@@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 	u32 minorversion = clp->cl_minorversion;
 
 	cb->cb_minorversion = minorversion;
@@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 
 	dprintk("%s: minorversion=%d\n", __func__,
 		clp->cl_minorversion);
@@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
 	nfsd4_cb_done(task, calldata);
@@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfsd4_callback *cb = &dp->dl_recall;
-	struct nfs4_client *clp = dp->dl_client;
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
 
 	dp->dl_retries = 1;
 	cb->cb_op = dp;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e8077766661..fa383361bc6 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -35,6 +35,7 @@
 #include <linux/file.h>
 #include <linux/slab.h>
 
+#include "idmap.h"
 #include "cache.h"
 #include "xdr4.h"
 #include "vfs.h"
@@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 		!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
 		return nfserr_inval;
 
+	accmode |= NFSD_MAY_READ_IF_EXEC;
+
 	if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
 		accmode |= NFSD_MAY_READ;
 	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
@@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 	return status;
 }
 
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+{
+	umode_t mode = fh->fh_dentry->d_inode->i_mode;
+
+	if (S_ISREG(mode))
+		return nfs_ok;
+	if (S_ISDIR(mode))
+		return nfserr_isdir;
+	/*
+	 * Using err_symlink as our catch-all case may look odd; but
+	 * there's no other obvious error for this case in 4.0, and we
+	 * happen to know that it will cause the linux v4 client to do
+	 * the right thing on attempts to open something other than a
+	 * regular file.
+	 */
+	return nfserr_symlink;
+}
+
 static __be32
 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
 	struct svc_fh resfh;
 	__be32 status;
-	int created = 0;
 
 	fh_init(&resfh, NFS4_FHSIZE);
 	open->op_truncate = 0;
@@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 					open->op_fname.len, &open->op_iattr,
 					&resfh, open->op_createmode,
 					(u32 *)open->op_verf.data,
-					&open->op_truncate, &created);
+					&open->op_truncate, &open->op_created);
 
 		/*
 		 * Following rfc 3530 14.2.16, use the returned bitmask
@@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 		status = nfsd_lookup(rqstp, current_fh,
 				     open->op_fname.data, open->op_fname.len, &resfh);
 		fh_unlock(current_fh);
+		if (status)
+			goto out;
+		status = nfsd_check_obj_isreg(&resfh);
 	}
 	if (status)
 		goto out;
@@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	fh_dup2(current_fh, &resfh);
 
 	/* set reply cache */
-	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
 			&resfh.fh_handle);
-	if (!created)
+	if (!open->op_created)
 		status = do_open_permission(rqstp, current_fh, open,
 					    NFSD_MAY_NOP);
 
@@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
 	/* set replay cache */
-	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
 			&current_fh->fh_handle);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
@@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 	struct nfsd4_compoundres *resp;
 
-	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
+	dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
-		open->op_stateowner);
+		open->op_openowner);
 
 	/* This check required by spec. */
 	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
 		return nfserr_inval;
 
+	/* We don't yet support WANT bits: */
+	open->op_share_access &= NFS4_SHARE_ACCESS_MASK;
+
+	open->op_created = 0;
 	/*
 	 * RFC5661 18.51.3
 	 * Before RECLAIM_COMPLETE done, server should deny new lock
@@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	resp = rqstp->rq_resp;
 	status = nfsd4_process_open1(&resp->cstate, open);
 	if (status == nfserr_replay_me) {
-		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
+		struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
 		fh_put(&cstate->current_fh);
 		fh_copy_shallow(&cstate->current_fh.fh_handle,
 				&rp->rp_openfh);
@@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
 		case NFS4_OPEN_CLAIM_NULL:
-			/*
-			 * (1) set CURRENT_FH to the file being opened,
-			 * creating it if necessary, (2) set open->op_cinfo,
-			 * (3) set open->op_truncate if the file is to be
-			 * truncated after opening, (4) do permission checking.
-			 */
 			status = do_open_lookup(rqstp, &cstate->current_fh,
 						open);
 			if (status)
 				goto out;
 			break;
 		case NFS4_OPEN_CLAIM_PREVIOUS:
-			open->op_stateowner->so_confirmed = 1;
-			/*
-			 * The CURRENT_FH is already set to the file being
-			 * opened.  (1) set open->op_cinfo, (2) set
-			 * open->op_truncate if the file is to be truncated
-			 * after opening, (3) do permission checking.
-			*/
+			open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+		case NFS4_OPEN_CLAIM_FH:
+		case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
 			status = do_open_fhandle(rqstp, &cstate->current_fh,
 						 open);
 			if (status)
 				goto out;
 			break;
+		case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
              	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
-			open->op_stateowner->so_confirmed = 1;
+			open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
 			dprintk("NFSD: unsupported OPEN claim type %d\n",
 				open->op_claim_type);
 			status = nfserr_notsupp;
@@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
 	 */
 	status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+	WARN_ON(status && open->op_created);
 out:
-	if (open->op_stateowner) {
-		nfs4_get_stateowner(open->op_stateowner);
-		cstate->replay_owner = open->op_stateowner;
-	}
-	nfs4_unlock_state();
+	nfsd4_cleanup_open_state(open, status);
+	if (open->op_openowner)
+		cstate->replay_owner = &open->op_openowner->oo_owner;
+	else
+		nfs4_unlock_state();
 	return status;
 }
 
@@ -467,17 +486,12 @@ static __be32
 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	     struct nfsd4_commit *commit)
 {
-	__be32 status;
-
 	u32 *p = (u32 *)commit->co_verf.data;
 	*p++ = nfssvc_boot.tv_sec;
 	*p++ = nfssvc_boot.tv_usec;
 
-	status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+	return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
 			     commit->co_count);
-	if (status == nfserr_symlink)
-		status = nfserr_inval;
-	return status;
 }
 
 static __be32
@@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
 			   NFSD_MAY_CREATE);
-	if (status == nfserr_symlink)
-		status = nfserr_notdir;
 	if (status)
 		return status;
 
@@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
 	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
-	if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
+	if ((cookie == 1) || (cookie == 2) ||
 	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
 		return nfserr_bad_cookie;
 
@@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return nfserr_grace;
 	status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
 			     remove->rm_name, remove->rm_namelen);
-	if (status == nfserr_symlink)
-		return nfserr_notdir;
 	if (!status) {
 		fh_unlock(&cstate->current_fh);
 		set_change_info(&remove->rm_cinfo, &cstate->current_fh);
@@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                   (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
                    S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
 		status = nfserr_exist;
-	else if (status == nfserr_symlink)
-		status = nfserr_notdir;
 
 	if (!status) {
 		set_change_info(&rename->rn_sinfo, &cstate->current_fh);
@@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	write->wr_bytes_written = cnt;
 
-	if (status == nfserr_symlink)
-		status = nfserr_inval;
 	return status;
 }
 
@@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	count = 4 + (verify->ve_attrlen >> 2);
 	buf = kmalloc(count << 2, GFP_KERNEL);
 	if (!buf)
-		return nfserr_resource;
+		return nfserr_jukebox;
 
 	status = nfsd4_encode_fattr(&cstate->current_fh,
 				    cstate->current_fh.fh_export,
@@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
+typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op);
+
 enum nfsd4_op_flags {
 	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
 	ALLOWED_ON_ABSENT_FS = 1 << 1,	/* ops processed on absent fs */
@@ -1001,13 +1009,15 @@ enum nfsd4_op_flags {
 	/* For rfc 5661 section 2.6.3.1.1: */
 	OP_HANDLES_WRONGSEC = 1 << 3,
 	OP_IS_PUTFH_LIKE = 1 << 4,
-};
-
-struct nfsd4_operation {
-	nfsd4op_func op_func;
-	u32 op_flags;
-	char *op_name;
 	/*
+	 * These are the ops whose result size we estimate before
+	 * encoding, to avoid performing an op then not being able to
+	 * respond or cache a response.  This includes writes and setattrs
+	 * as well as the operations usually called "nonidempotent":
+	 */
+	OP_MODIFIES_SOMETHING = 1 << 5,
+	/*
+	 * Cache compounds containing these ops in the xid-based drc:
 	 * We use the DRC for compounds containing non-idempotent
 	 * operations, *except* those that are 4.1-specific (since
 	 * sessions provide their own EOS), and except for stateful
@@ -1015,7 +1025,15 @@ struct nfsd4_operation {
 	 * (since sequence numbers provide EOS for open, lock, etc in
 	 * the v4.0 case).
 	 */
-	bool op_cacheresult;
+	OP_CACHEME = 1 << 6,
+};
+
+struct nfsd4_operation {
+	nfsd4op_func op_func;
+	u32 op_flags;
+	char *op_name;
+	/* Try to get response size before operation */
+	nfsd4op_rsize op_rsize_bop;
 };
 
 static struct nfsd4_operation nfsd4_ops[];
@@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
 
 bool nfsd4_cache_this_op(struct nfsd4_op *op)
 {
-	return OPDESC(op)->op_cacheresult;
+	return OPDESC(op)->op_flags & OP_CACHEME;
 }
 
 static bool need_wrongsec_check(struct svc_rqst *rqstp)
@@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	struct nfsd4_operation *opdesc;
 	struct nfsd4_compound_state *cstate = &resp->cstate;
 	int		slack_bytes;
+	u32		plen = 0;
 	__be32		status;
 
 	resp->xbuf = &rqstp->rq_res;
@@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 			goto encode_op;
 		}
 
+		/* If op is non-idempotent */
+		if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+			plen = opdesc->op_rsize_bop(rqstp, op);
+			op->status = nfsd4_check_resp_size(resp, plen);
+		}
+
+		if (op->status)
+			goto encode_op;
+
 		if (opdesc->op_func)
 			op->status = opdesc->op_func(rqstp, cstate, &op->u);
 		else
@@ -1217,7 +1245,7 @@ encode_op:
 			be32_to_cpu(status));
 
 		if (cstate->replay_owner) {
-			nfs4_put_stateowner(cstate->replay_owner);
+			nfs4_unlock_state();
 			cstate->replay_owner = NULL;
 		}
 		/* XXX Ugh, we need to get rid of this kind of special case: */
@@ -1238,6 +1266,144 @@ out:
 	return status;
 }
 
+#define op_encode_hdr_size		(2)
+#define op_encode_stateid_maxsz		(XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define op_encode_verifier_maxsz	(XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define op_encode_change_info_maxsz	(5)
+#define nfs4_fattr_bitmap_maxsz		(4)
+
+#define op_encode_lockowner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define op_encode_lock_denied_maxsz	(8 + op_encode_lockowner_maxsz)
+
+#define nfs4_owner_maxsz		(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+
+#define op_encode_ace_maxsz		(3 + nfs4_owner_maxsz)
+#define op_encode_delegation_maxsz	(1 + op_encode_stateid_maxsz + 1 + \
+					 op_encode_ace_maxsz)
+
+#define op_encode_channel_attrs_maxsz	(6 + 1 + 1)
+
+static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
+}
+
+static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz
+		+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_stateid_maxsz
+		+ op_encode_change_info_maxsz + 1
+		+ nfs4_fattr_bitmap_maxsz
+		+ op_encode_delegation_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = 0, rlen = 0;
+
+	maxcount = svc_max_payload(rqstp);
+	rlen = op->u.read.rd_length;
+
+	if (rlen > maxcount)
+		rlen = maxcount;
+
+	return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 rlen = op->u.readdir.rd_maxcount;
+
+	if (rlen > PAGE_SIZE)
+		rlen = PAGE_SIZE;
+
+	return (op_encode_hdr_size + op_encode_verifier_maxsz)
+		 * sizeof(__be32) + rlen;
+}
+
+static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz)
+		* sizeof(__be32);
+}
+
+static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_change_info_maxsz
+		+ op_encode_change_info_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
+		1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\
+		2 + /*eir_server_owner.so_minor_id */\
+		/* eir_server_owner.so_major_id<> */\
+		XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+		/* eir_server_scope<> */\
+		XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
+		1 + /* eir_server_impl_id array length */\
+		0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + \
+		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
+		2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size + \
+		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
+		2 + /* csr_sequence, csr_flags */\
+		op_encode_channel_attrs_maxsz + \
+		op_encode_channel_attrs_maxsz) * sizeof(__be32);
+}
+
 static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
@@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_CLOSE] = {
 		.op_func = (nfsd4op_func)nfsd4_close,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_CLOSE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_COMMIT] = {
 		.op_func = (nfsd4op_func)nfsd4_commit,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_COMMIT",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize,
 	},
 	[OP_CREATE] = {
 		.op_func = (nfsd4op_func)nfsd4_create,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_CREATE",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize,
 	},
 	[OP_DELEGRETURN] = {
 		.op_func = (nfsd4op_func)nfsd4_delegreturn,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_DELEGRETURN",
+		.op_rsize_bop = nfsd4_only_status_rsize,
 	},
 	[OP_GETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_getattr,
@@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_LINK] = {
 		.op_func = (nfsd4op_func)nfsd4_link,
+		.op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
+				| OP_CACHEME,
 		.op_name = "OP_LINK",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize,
 	},
 	[OP_LOCK] = {
 		.op_func = (nfsd4op_func)nfsd4_lock,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_LOCK",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize,
 	},
 	[OP_LOCKT] = {
 		.op_func = (nfsd4op_func)nfsd4_lockt,
@@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_LOCKU] = {
 		.op_func = (nfsd4op_func)nfsd4_locku,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_LOCKU",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_LOOKUP] = {
 		.op_func = (nfsd4op_func)nfsd4_lookup,
@@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_OPEN] = {
 		.op_func = (nfsd4op_func)nfsd4_open,
-		.op_flags = OP_HANDLES_WRONGSEC,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_OPEN",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize,
 	},
 	[OP_OPEN_CONFIRM] = {
 		.op_func = (nfsd4op_func)nfsd4_open_confirm,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_OPEN_CONFIRM",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_OPEN_DOWNGRADE] = {
 		.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_OPEN_DOWNGRADE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize,
 	},
 	[OP_PUTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_PUTFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_PUTPUBFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_PUTPUBFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_PUTROOTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_PUTROOTFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_READ] = {
 		.op_func = (nfsd4op_func)nfsd4_read,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_READ",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
 	},
 	[OP_READDIR] = {
 		.op_func = (nfsd4op_func)nfsd4_readdir,
+		.op_flags = OP_MODIFIES_SOMETHING,
 		.op_name = "OP_READDIR",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
 	},
 	[OP_READLINK] = {
 		.op_func = (nfsd4op_func)nfsd4_readlink,
@@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_REMOVE] = {
 		.op_func = (nfsd4op_func)nfsd4_remove,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_REMOVE",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize,
 	},
 	[OP_RENAME] = {
-		.op_name = "OP_RENAME",
 		.op_func = (nfsd4op_func)nfsd4_rename,
-		.op_cacheresult = true,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_RENAME",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize,
 	},
 	[OP_RENEW] = {
 		.op_func = (nfsd4op_func)nfsd4_renew,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RENEW",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+
 	},
 	[OP_RESTOREFH] = {
 		.op_func = (nfsd4op_func)nfsd4_restorefh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
-				| OP_IS_PUTFH_LIKE,
+				| OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RESTOREFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SAVEFH] = {
 		.op_func = (nfsd4op_func)nfsd4_savefh,
-		.op_flags = OP_HANDLES_WRONGSEC,
+		.op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_SAVEFH",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SECINFO] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo,
@@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_SETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_setattr,
 		.op_name = "OP_SETATTR",
-		.op_cacheresult = true,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize,
 	},
 	[OP_SETCLIENTID] = {
 		.op_func = (nfsd4op_func)nfsd4_setclientid,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_SETCLIENTID",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize,
 	},
 	[OP_SETCLIENTID_CONFIRM] = {
 		.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_SETCLIENTID_CONFIRM",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_VERIFY] = {
 		.op_func = (nfsd4op_func)nfsd4_verify,
@@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_WRITE] = {
 		.op_func = (nfsd4op_func)nfsd4_write,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
 		.op_name = "OP_WRITE",
-		.op_cacheresult = true,
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
 	},
 	[OP_RELEASE_LOCKOWNER] = {
 		.op_func = (nfsd4op_func)nfsd4_release_lockowner,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RELEASE_LOCKOWNER",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 
 	/* NFSv4.1 operations */
 	[OP_EXCHANGE_ID] = {
 		.op_func = (nfsd4op_func)nfsd4_exchange_id,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_EXCHANGE_ID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
 	},
 	[OP_BIND_CONN_TO_SESSION] = {
 		.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_BIND_CONN_TO_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize,
 	},
 	[OP_CREATE_SESSION] = {
 		.op_func = (nfsd4op_func)nfsd4_create_session,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_CREATE_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize,
 	},
 	[OP_DESTROY_SESSION] = {
 		.op_func = (nfsd4op_func)nfsd4_destroy_session,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_DESTROY_SESSION",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SEQUENCE] = {
 		.op_func = (nfsd4op_func)nfsd4_sequence,
@@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_name = "OP_SEQUENCE",
 	},
 	[OP_DESTROY_CLIENTID] = {
-		.op_func = NULL,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_func = (nfsd4op_func)nfsd4_destroy_clientid,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+				| OP_MODIFIES_SOMETHING,
 		.op_name = "OP_DESTROY_CLIENTID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_RECLAIM_COMPLETE] = {
 		.op_func = (nfsd4op_func)nfsd4_reclaim_complete,
-		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_RECLAIM_COMPLETE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 	[OP_SECINFO_NO_NAME] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
@@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_FREE_STATEID] = {
 		.op_func = (nfsd4op_func)nfsd4_free_stateid,
-		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
 		.op_name = "OP_FREE_STATEID",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
 };
 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 29d77f60585..ed083b9a731 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,6 +45,7 @@
 
 /* Globals */
 static struct file *rec_file;
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 	struct xdr_netobj cksum;
 	struct hash_desc desc;
 	struct scatterlist sg;
-	__be32 status = nfserr_resource;
+	__be32 status = nfserr_jukebox;
 
 	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
 			clname->len, clname->data);
@@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	if (!rec_file || clp->cl_firststate)
 		return 0;
 
+	clp->cl_firststate = 1;
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
@@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		goto out_unlock;
 	}
 	status = -EEXIST;
-	if (dentry->d_inode) {
-		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
+	if (dentry->d_inode)
 		goto out_put;
-	}
 	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out_put;
@@ -156,12 +156,14 @@ out_put:
 	dput(dentry);
 out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
-	if (status == 0) {
-		clp->cl_firststate = 1;
+	if (status == 0)
 		vfs_fsync(rec_file, 0);
-	}
+	else
+		printk(KERN_ERR "NFSD: failed to write recovery record"
+				" (err %d); please check that %s exists"
+				" and is writeable", status,
+				user_recovery_dirname);
 	nfs4_reset_creds(original_cred);
-	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
 	return status;
 }
 
@@ -354,13 +356,13 @@ nfsd4_recdir_load(void) {
  */
 
 void
-nfsd4_init_recdir(char *rec_dirname)
+nfsd4_init_recdir()
 {
 	const struct cred *original_cred;
 	int status;
 
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
-			rec_dirname);
+			user_recovery_dirname);
 
 	BUG_ON(rec_file);
 
@@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname)
 		return;
 	}
 
-	rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+	rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
 	if (IS_ERR(rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
-				rec_dirname);
+				user_recovery_dirname);
 		rec_file = NULL;
 	}
 
@@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void)
 	fput(rec_file);
 	rec_file = NULL;
 }
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+	int status;
+	struct path path;
+
+	status = kern_path(recdir, LOOKUP_FOLLOW, &path);
+	if (status)
+		return status;
+	status = -ENOTDIR;
+	if (S_ISDIR(path.dentry->d_inode->i_mode)) {
+		strcpy(user_recovery_dirname, recdir);
+		status = 0;
+	}
+	path_put(&path);
+	return status;
+}
+
+char *
+nfs4_recoverydir(void)
+{
+	return user_recovery_dirname;
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3787ec11740..47e94e33a97 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,9 +49,6 @@
 time_t nfsd4_lease = 90;     /* default lease time */
 time_t nfsd4_grace = 90;
 static time_t boot_time;
-static u32 current_ownerid = 1;
-static u32 current_fileid = 1;
-static u32 current_delegid = 1;
 static stateid_t zerostateid;             /* bits all 0 */
 static stateid_t onestateid;              /* bits all 1 */
 static u64 current_sessionid = 1;
@@ -60,13 +57,7 @@ static u64 current_sessionid = 1;
 #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
 
 /* forward declarations */
-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
-static struct nfs4_stateid * search_for_stateid(stateid_t *stid);
-static struct nfs4_delegation * search_for_delegation(stateid_t *stid);
-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static void nfs4_set_recdir(char *recdir);
-static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner);
+static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
 
 /* Locking: */
 
@@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex);
  */
 static DEFINE_SPINLOCK(recall_lock);
 
-static struct kmem_cache *stateowner_slab = NULL;
+static struct kmem_cache *openowner_slab = NULL;
+static struct kmem_cache *lockowner_slab = NULL;
 static struct kmem_cache *file_slab = NULL;
 static struct kmem_cache *stateid_slab = NULL;
 static struct kmem_cache *deleg_slab = NULL;
@@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes)
 
 static struct list_head del_recall_lru;
 
+static void nfsd4_free_file(struct nfs4_file *f)
+{
+	kmem_cache_free(file_slab, f);
+}
+
 static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
@@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi)
 		list_del(&fi->fi_hash);
 		spin_unlock(&recall_lock);
 		iput(fi->fi_inode);
-		kmem_cache_free(file_slab, fi);
+		nfsd4_free_file(fi);
 	}
 }
 
@@ -136,35 +133,33 @@ unsigned int max_delegations;
  * Open owner state (share locks)
  */
 
-/* hash tables for nfs4_stateowner */
-#define OWNER_HASH_BITS              8
-#define OWNER_HASH_SIZE             (1 << OWNER_HASH_BITS)
-#define OWNER_HASH_MASK             (OWNER_HASH_SIZE - 1)
+/* hash tables for open owners */
+#define OPEN_OWNER_HASH_BITS              8
+#define OPEN_OWNER_HASH_SIZE             (1 << OPEN_OWNER_HASH_BITS)
+#define OPEN_OWNER_HASH_MASK             (OPEN_OWNER_HASH_SIZE - 1)
 
-#define ownerid_hashval(id) \
-        ((id) & OWNER_HASH_MASK)
-#define ownerstr_hashval(clientid, ownername) \
-        (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
+static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+{
+	unsigned int ret;
 
-static struct list_head	ownerid_hashtbl[OWNER_HASH_SIZE];
-static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE];
+	ret = opaque_hashval(ownername->data, ownername->len);
+	ret += clientid;
+	return ret & OPEN_OWNER_HASH_MASK;
+}
+
+static struct list_head	open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
 
 /* hash table for nfs4_file */
 #define FILE_HASH_BITS                   8
 #define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
 
-/* hash table for (open)nfs4_stateid */
-#define STATEID_HASH_BITS              10
-#define STATEID_HASH_SIZE              (1 << STATEID_HASH_BITS)
-#define STATEID_HASH_MASK              (STATEID_HASH_SIZE - 1)
-
-#define file_hashval(x) \
-        hash_ptr(x, FILE_HASH_BITS)
-#define stateid_hashval(owner_id, file_id)  \
-        (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+static unsigned int file_hashval(struct inode *ino)
+{
+	/* XXX: why are we hashing on inode pointer, anyway? */
+	return hash_ptr(ino, FILE_HASH_BITS);
+}
 
 static struct list_head file_hashtbl[FILE_HASH_SIZE];
-static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
 
 static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
 {
@@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
 static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 {
 	if (atomic_dec_and_test(&fp->fi_access[oflag])) {
-		nfs4_file_put_fd(fp, O_RDWR);
 		nfs4_file_put_fd(fp, oflag);
+		/*
+		 * It's also safe to get rid of the RDWR open *if*
+		 * we no longer have need of the other kind of access
+		 * or if we already have the other kind of open:
+		 */
+		if (fp->fi_fds[1-oflag]
+			|| atomic_read(&fp->fi_access[1 - oflag]) == 0)
+			nfs4_file_put_fd(fp, O_RDWR);
 	}
 }
 
@@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 		__nfs4_file_put_access(fp, oflag);
 }
 
+static inline int get_new_stid(struct nfs4_stid *stid)
+{
+	static int min_stateid = 0;
+	struct idr *stateids = &stid->sc_client->cl_stateids;
+	int new_stid;
+	int error;
+
+	error = idr_get_new_above(stateids, stid, min_stateid, &new_stid);
+	/*
+	 * Note: the necessary preallocation was done in
+	 * nfs4_alloc_stateid().  The idr code caps the number of
+	 * preallocations that can exist at a time, but the state lock
+	 * prevents anyone from using ours before we get here:
+	 */
+	BUG_ON(error);
+	/*
+	 * It shouldn't be a problem to reuse an opaque stateid value.
+	 * I don't think it is for 4.1.  But with 4.0 I worry that, for
+	 * example, a stray write retransmission could be accepted by
+	 * the server when it should have been rejected.  Therefore,
+	 * adopt a trick from the sctp code to attempt to maximize the
+	 * amount of time until an id is reused, by ensuring they always
+	 * "increase" (mod INT_MAX):
+	 */
+
+	min_stateid = new_stid+1;
+	if (min_stateid == INT_MAX)
+		min_stateid = 0;
+	return new_stid;
+}
+
+static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
+{
+	stateid_t *s = &stid->sc_stateid;
+	int new_id;
+
+	stid->sc_type = type;
+	stid->sc_client = cl;
+	s->si_opaque.so_clid = cl->cl_clientid;
+	new_id = get_new_stid(stid);
+	s->si_opaque.so_id = (u32)new_id;
+	/* Will be incremented before return to client: */
+	s->si_generation = 0;
+}
+
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
+{
+	struct idr *stateids = &cl->cl_stateids;
+
+	if (!idr_pre_get(stateids, GFP_KERNEL))
+		return NULL;
+	/*
+	 * Note: if we fail here (or any time between now and the time
+	 * we actually get the new idr), we won't need to undo the idr
+	 * preallocation, since the idr code caps the number of
+	 * preallocated entries.
+	 */
+	return kmem_cache_alloc(slab, GFP_KERNEL);
+}
+
+static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
+{
+	return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
+}
+
 static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_file *fp = stp->st_file;
@@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 		return NULL;
 	if (num_delegations > max_delegations)
 		return NULL;
-	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
+	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
 	if (dp == NULL)
 		return dp;
+	init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+	/*
+	 * delegation seqid's are never incremented.  The 4.1 special
+	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
+	 * 0 anyway just for consistency and use 1:
+	 */
+	dp->dl_stid.sc_stateid.si_generation = 1;
 	num_delegations++;
 	INIT_LIST_HEAD(&dp->dl_perfile);
 	INIT_LIST_HEAD(&dp->dl_perclnt);
 	INIT_LIST_HEAD(&dp->dl_recall_lru);
-	dp->dl_client = clp;
 	get_nfs4_file(fp);
 	dp->dl_file = fp;
 	dp->dl_type = type;
-	dp->dl_stateid.si_boot = boot_time;
-	dp->dl_stateid.si_stateownerid = current_delegid++;
-	dp->dl_stateid.si_fileid = 0;
-	dp->dl_stateid.si_generation = 0;
 	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
@@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 	}
 }
 
+static void unhash_stid(struct nfs4_stid *s)
+{
+	struct idr *stateids = &s->sc_client->cl_stateids;
+
+	idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+}
+
 /* Called under the state lock. */
 static void
 unhash_delegation(struct nfs4_delegation *dp)
 {
+	unhash_stid(&dp->dl_stid);
 	list_del_init(&dp->dl_perclnt);
 	spin_lock(&recall_lock);
 	list_del_init(&dp->dl_perfile);
@@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock);
 #define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
 #define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1)
 
-#define clientid_hashval(id) \
-	((id) & CLIENT_HASH_MASK)
-#define clientstr_hashval(name) \
-	(opaque_hashval((name), 8) & CLIENT_HASH_MASK)
+static unsigned int clientid_hashval(u32 id)
+{
+	return id & CLIENT_HASH_MASK;
+}
+
+static unsigned int clientstr_hashval(const char *name)
+{
+	return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+}
+
 /*
  * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
  * used in reboot/reset lease grace period processing
@@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) {
 }
 
 static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
+test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
 	unsigned int access, deny;
 
 	set_access(&access, stp->st_access_bmap);
@@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access)
 	BUG();
 }
 
-static void unhash_generic_stateid(struct nfs4_stateid *stp)
+static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
 {
-	list_del(&stp->st_hash);
 	list_del(&stp->st_perfile);
 	list_del(&stp->st_perstateowner);
 }
 
-static void free_generic_stateid(struct nfs4_stateid *stp)
+static void close_generic_stateid(struct nfs4_ol_stateid *stp)
 {
 	int i;
 
@@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
 			if (test_bit(i, &stp->st_access_bmap))
 				nfs4_file_put_access(stp->st_file,
 						nfs4_access_to_omode(i));
+			__clear_bit(i, &stp->st_access_bmap);
 		}
 	}
 	put_nfs4_file(stp->st_file);
+	stp->st_file = NULL;
+}
+
+static void free_generic_stateid(struct nfs4_ol_stateid *stp)
+{
 	kmem_cache_free(stateid_slab, stp);
 }
 
-static void release_lock_stateid(struct nfs4_stateid *stp)
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
 {
 	struct file *file;
 
 	unhash_generic_stateid(stp);
+	unhash_stid(&stp->st_stid);
 	file = find_any_file(stp->st_file);
 	if (file)
-		locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
+		locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
+	close_generic_stateid(stp);
 	free_generic_stateid(stp);
 }
 
-static void unhash_lockowner(struct nfs4_stateowner *sop)
+static void unhash_lockowner(struct nfs4_lockowner *lo)
 {
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_first_entry(&sop->so_stateids,
-				struct nfs4_stateid, st_perstateowner);
+	list_del(&lo->lo_owner.so_strhash);
+	list_del(&lo->lo_perstateid);
+	while (!list_empty(&lo->lo_owner.so_stateids)) {
+		stp = list_first_entry(&lo->lo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
 		release_lock_stateid(stp);
 	}
 }
 
-static void release_lockowner(struct nfs4_stateowner *sop)
+static void release_lockowner(struct nfs4_lockowner *lo)
 {
-	unhash_lockowner(sop);
-	nfs4_put_stateowner(sop);
+	unhash_lockowner(lo);
+	nfs4_free_lockowner(lo);
 }
 
 static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
+release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
 {
-	struct nfs4_stateowner *lock_sop;
+	struct nfs4_lockowner *lo;
 
 	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners);  */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_lockowner(lock_sop);
+		lo = list_entry(open_stp->st_lockowners.next,
+				struct nfs4_lockowner, lo_perstateid);
+		release_lockowner(lo);
 	}
 }
 
-static void release_open_stateid(struct nfs4_stateid *stp)
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
 {
 	unhash_generic_stateid(stp);
 	release_stateid_lockowners(stp);
+	close_generic_stateid(stp);
+}
+
+static void release_open_stateid(struct nfs4_ol_stateid *stp)
+{
+	unhash_open_stateid(stp);
+	unhash_stid(&stp->st_stid);
 	free_generic_stateid(stp);
 }
 
-static void unhash_openowner(struct nfs4_stateowner *sop)
+static void unhash_openowner(struct nfs4_openowner *oo)
 {
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	list_del(&sop->so_perclient);
-	list_del(&sop->so_perstateid); /* XXX: necessary? */
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_first_entry(&sop->so_stateids,
-				struct nfs4_stateid, st_perstateowner);
+	list_del(&oo->oo_owner.so_strhash);
+	list_del(&oo->oo_perclient);
+	while (!list_empty(&oo->oo_owner.so_stateids)) {
+		stp = list_first_entry(&oo->oo_owner.so_stateids,
+				struct nfs4_ol_stateid, st_perstateowner);
 		release_open_stateid(stp);
 	}
 }
 
-static void release_openowner(struct nfs4_stateowner *sop)
+static void release_last_closed_stateid(struct nfs4_openowner *oo)
 {
-	unhash_openowner(sop);
-	list_del(&sop->so_close_lru);
-	nfs4_put_stateowner(sop);
+	struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
+
+	if (s) {
+		unhash_stid(&s->st_stid);
+		free_generic_stateid(s);
+		oo->oo_last_closed_stid = NULL;
+	}
+}
+
+static void release_openowner(struct nfs4_openowner *oo)
+{
+	unhash_openowner(oo);
+	list_del(&oo->oo_close_lru);
+	release_last_closed_stateid(oo);
+	nfs4_free_openowner(oo);
 }
 
 #define SESSION_HASH_SIZE	512
@@ -843,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp)
 		return;
 	}
 
-	/*
-	* Move client to the end to the LRU list.
-	*/
 	dprintk("renewing client (clientid %08x/%08x)\n", 
 			clp->cl_clientid.cl_boot, 
 			clp->cl_clientid.cl_id);
@@ -943,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp)
 static void
 expire_client(struct nfs4_client *clp)
 {
-	struct nfs4_stateowner *sop;
+	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
 
@@ -961,8 +1062,8 @@ expire_client(struct nfs4_client *clp)
 		unhash_delegation(dp);
 	}
 	while (!list_empty(&clp->cl_openowners)) {
-		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
-		release_openowner(sop);
+		oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
+		release_openowner(oo);
 	}
 	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
@@ -1038,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp)
 	*p++ = i++;
 }
 
+static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
+{
+	return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+}
+
+static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+{
+	struct nfs4_stid *s;
+
+	s = find_stateid(cl, t);
+	if (!s)
+		return NULL;
+	if (typemask & s->sc_type)
+		return s;
+	return NULL;
+}
+
 static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 		struct svc_rqst *rqstp, nfs4_verifier *verf)
 {
@@ -1060,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 		}
 	}
 
+	idr_init(&clp->cl_stateids);
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_refcount, 0);
 	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
@@ -1083,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	return clp;
 }
 
-static int check_name(struct xdr_netobj name)
-{
-	if (name.len == 0) 
-		return 0;
-	if (name.len > NFS4_OPAQUE_LIMIT) {
-		dprintk("NFSD: check_name: name too long(%d)!\n", name.len);
-		return 0;
-	}
-	return 1;
-}
-
 static void
 add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 {
@@ -1125,8 +1233,10 @@ find_confirmed_client(clientid_t *clid)
 	unsigned int idhashval = clientid_hashval(clid->cl_id);
 
 	list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
-		if (same_clid(&clp->cl_clientid, clid))
+		if (same_clid(&clp->cl_clientid, clid)) {
+			renew_client(clp);
 			return clp;
+		}
 	}
 	return NULL;
 }
@@ -1173,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
 	return NULL;
 }
 
-static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
-{
-	switch (family) {
-	case AF_INET:
-		((struct sockaddr_in *)sa)->sin_family = AF_INET;
-		((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
-		return;
-	case AF_INET6:
-		((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
-		((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
-		return;
-	}
-}
-
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
 {
@@ -1218,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
 
 	conn->cb_prog = se->se_callback_prog;
 	conn->cb_ident = se->se_callback_ident;
-	rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
+	memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
 	return;
 out_err:
 	conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1350,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
 		addr_str, exid->flags, exid->spa_how);
 
-	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+	if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
 		return nfserr_inval;
 
 	/* Currently only support SP4_NONE */
@@ -1849,8 +1945,16 @@ out:
 
 		nfsd4_get_session(cstate->session);
 		atomic_inc(&clp->cl_refcount);
-		if (clp->cl_cb_state == NFSD4_CB_DOWN)
-			seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
+		switch (clp->cl_cb_state) {
+		case NFSD4_CB_DOWN:
+			seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+			break;
+		case NFSD4_CB_FAULT:
+			seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+			break;
+		default:
+			seq->status_flags = 0;
+		}
 	}
 	kfree(conn);
 	spin_unlock(&client_lock);
@@ -1858,6 +1962,50 @@ out:
 	return status;
 }
 
+static inline bool has_resources(struct nfs4_client *clp)
+{
+	return !list_empty(&clp->cl_openowners)
+		|| !list_empty(&clp->cl_delegations)
+		|| !list_empty(&clp->cl_sessions);
+}
+
+__be32
+nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
+{
+	struct nfs4_client *conf, *unconf, *clp;
+	int status = 0;
+
+	nfs4_lock_state();
+	unconf = find_unconfirmed_client(&dc->clientid);
+	conf = find_confirmed_client(&dc->clientid);
+
+	if (conf) {
+		clp = conf;
+
+		if (!is_client_expired(conf) && has_resources(conf)) {
+			status = nfserr_clientid_busy;
+			goto out;
+		}
+
+		/* rfc5661 18.50.3 */
+		if (cstate->session && conf == cstate->session->se_client) {
+			status = nfserr_clientid_busy;
+			goto out;
+		}
+	} else if (unconf)
+		clp = unconf;
+	else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+
+	expire_client(clp);
+out:
+	nfs4_unlock_state();
+	dprintk("%s return %d\n", __func__, ntohl(status));
+	return status;
+}
+
 __be32
 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
 {
@@ -1900,19 +2048,13 @@ __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
-	struct xdr_netobj 	clname = { 
-		.len = setclid->se_namelen,
-		.data = setclid->se_name,
-	};
+	struct xdr_netobj 	clname = setclid->se_name;
 	nfs4_verifier		clverifier = setclid->se_verf;
 	unsigned int 		strhashval;
 	struct nfs4_client	*conf, *unconf, *new;
 	__be32 			status;
 	char                    dname[HEXDIR_LEN];
 	
-	if (!check_name(clname))
-		return nfserr_inval;
-
 	status = nfs4_make_rec_clidname(dname, &clname);
 	if (status)
 		return status;
@@ -1946,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
 	unconf = find_unconfirmed_client_by_str(dname, strhashval);
-	status = nfserr_resource;
+	status = nfserr_jukebox;
 	if (!conf) {
 		/*
 		 * RFC 3530 14.2.33 CASE 4:
@@ -2116,31 +2258,28 @@ out:
 	return status;
 }
 
+static struct nfs4_file *nfsd4_alloc_file(void)
+{
+	return kmem_cache_alloc(file_slab, GFP_KERNEL);
+}
+
 /* OPEN Share state helper functions */
-static inline struct nfs4_file *
-alloc_init_file(struct inode *ino)
+static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
 {
-	struct nfs4_file *fp;
 	unsigned int hashval = file_hashval(ino);
 
-	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
-	if (fp) {
-		atomic_set(&fp->fi_ref, 1);
-		INIT_LIST_HEAD(&fp->fi_hash);
-		INIT_LIST_HEAD(&fp->fi_stateids);
-		INIT_LIST_HEAD(&fp->fi_delegations);
-		fp->fi_inode = igrab(ino);
-		fp->fi_id = current_fileid++;
-		fp->fi_had_conflict = false;
-		fp->fi_lease = NULL;
-		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
-		memset(fp->fi_access, 0, sizeof(fp->fi_access));
-		spin_lock(&recall_lock);
-		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
-		spin_unlock(&recall_lock);
-		return fp;
-	}
-	return NULL;
+	atomic_set(&fp->fi_ref, 1);
+	INIT_LIST_HEAD(&fp->fi_hash);
+	INIT_LIST_HEAD(&fp->fi_stateids);
+	INIT_LIST_HEAD(&fp->fi_delegations);
+	fp->fi_inode = igrab(ino);
+	fp->fi_had_conflict = false;
+	fp->fi_lease = NULL;
+	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+	memset(fp->fi_access, 0, sizeof(fp->fi_access));
+	spin_lock(&recall_lock);
+	list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+	spin_unlock(&recall_lock);
 }
 
 static void
@@ -2155,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab)
 void
 nfsd4_free_slabs(void)
 {
-	nfsd4_free_slab(&stateowner_slab);
+	nfsd4_free_slab(&openowner_slab);
+	nfsd4_free_slab(&lockowner_slab);
 	nfsd4_free_slab(&file_slab);
 	nfsd4_free_slab(&stateid_slab);
 	nfsd4_free_slab(&deleg_slab);
@@ -2164,16 +2304,20 @@ nfsd4_free_slabs(void)
 static int
 nfsd4_init_slabs(void)
 {
-	stateowner_slab = kmem_cache_create("nfsd4_stateowners",
-			sizeof(struct nfs4_stateowner), 0, 0, NULL);
-	if (stateowner_slab == NULL)
+	openowner_slab = kmem_cache_create("nfsd4_openowners",
+			sizeof(struct nfs4_openowner), 0, 0, NULL);
+	if (openowner_slab == NULL)
+		goto out_nomem;
+	lockowner_slab = kmem_cache_create("nfsd4_lockowners",
+			sizeof(struct nfs4_openowner), 0, 0, NULL);
+	if (lockowner_slab == NULL)
 		goto out_nomem;
 	file_slab = kmem_cache_create("nfsd4_files",
 			sizeof(struct nfs4_file), 0, 0, NULL);
 	if (file_slab == NULL)
 		goto out_nomem;
 	stateid_slab = kmem_cache_create("nfsd4_stateids",
-			sizeof(struct nfs4_stateid), 0, 0, NULL);
+			sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
 	if (stateid_slab == NULL)
 		goto out_nomem;
 	deleg_slab = kmem_cache_create("nfsd4_delegations",
@@ -2187,97 +2331,94 @@ out_nomem:
 	return -ENOMEM;
 }
 
-void
-nfs4_free_stateowner(struct kref *kref)
+void nfs4_free_openowner(struct nfs4_openowner *oo)
 {
-	struct nfs4_stateowner *sop =
-		container_of(kref, struct nfs4_stateowner, so_ref);
-	kfree(sop->so_owner.data);
-	kmem_cache_free(stateowner_slab, sop);
+	kfree(oo->oo_owner.so_owner.data);
+	kmem_cache_free(openowner_slab, oo);
 }
 
-static inline struct nfs4_stateowner *
-alloc_stateowner(struct xdr_netobj *owner)
+void nfs4_free_lockowner(struct nfs4_lockowner *lo)
 {
-	struct nfs4_stateowner *sop;
+	kfree(lo->lo_owner.so_owner.data);
+	kmem_cache_free(lockowner_slab, lo);
+}
 
-	if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) {
-		if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) {
-			memcpy(sop->so_owner.data, owner->data, owner->len);
-			sop->so_owner.len = owner->len;
-			kref_init(&sop->so_ref);
-			return sop;
-		} 
-		kmem_cache_free(stateowner_slab, sop);
-	}
-	return NULL;
+static void init_nfs4_replay(struct nfs4_replay *rp)
+{
+	rp->rp_status = nfserr_serverfault;
+	rp->rp_buflen = 0;
+	rp->rp_buf = rp->rp_ibuf;
 }
 
-static struct nfs4_stateowner *
-alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
+{
 	struct nfs4_stateowner *sop;
-	struct nfs4_replay *rp;
-	unsigned int idhashval;
 
-	if (!(sop = alloc_stateowner(&open->op_owner)))
+	sop = kmem_cache_alloc(slab, GFP_KERNEL);
+	if (!sop)
+		return NULL;
+
+	sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL);
+	if (!sop->so_owner.data) {
+		kmem_cache_free(slab, sop);
 		return NULL;
-	idhashval = ownerid_hashval(current_ownerid);
-	INIT_LIST_HEAD(&sop->so_idhash);
-	INIT_LIST_HEAD(&sop->so_strhash);
-	INIT_LIST_HEAD(&sop->so_perclient);
+	}
+	sop->so_owner.len = owner->len;
+
 	INIT_LIST_HEAD(&sop->so_stateids);
-	INIT_LIST_HEAD(&sop->so_perstateid);  /* not used */
-	INIT_LIST_HEAD(&sop->so_close_lru);
-	sop->so_time = 0;
-	list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]);
-	list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]);
-	list_add(&sop->so_perclient, &clp->cl_openowners);
-	sop->so_is_open_owner = 1;
-	sop->so_id = current_ownerid++;
 	sop->so_client = clp;
-	sop->so_seqid = open->op_seqid;
-	sop->so_confirmed = 0;
-	rp = &sop->so_replay;
-	rp->rp_status = nfserr_serverfault;
-	rp->rp_buflen = 0;
-	rp->rp_buf = rp->rp_ibuf;
+	init_nfs4_replay(&sop->so_replay);
 	return sop;
 }
 
-static inline void
-init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
-	struct nfs4_stateowner *sop = open->op_stateowner;
-	unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
+static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
+{
+	list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]);
+	list_add(&oo->oo_perclient, &clp->cl_openowners);
+}
 
-	INIT_LIST_HEAD(&stp->st_hash);
-	INIT_LIST_HEAD(&stp->st_perstateowner);
+static struct nfs4_openowner *
+alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
+	struct nfs4_openowner *oo;
+
+	oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
+	if (!oo)
+		return NULL;
+	oo->oo_owner.so_is_open_owner = 1;
+	oo->oo_owner.so_seqid = open->op_seqid;
+	oo->oo_flags = NFS4_OO_NEW;
+	oo->oo_time = 0;
+	oo->oo_last_closed_stid = NULL;
+	INIT_LIST_HEAD(&oo->oo_close_lru);
+	hash_openowner(oo, clp, strhashval);
+	return oo;
+}
+
+static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+	struct nfs4_openowner *oo = open->op_openowner;
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+
+	init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
 	INIT_LIST_HEAD(&stp->st_lockowners);
-	INIT_LIST_HEAD(&stp->st_perfile);
-	list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
-	list_add(&stp->st_perstateowner, &sop->so_stateids);
+	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
-	stp->st_stateowner = sop;
+	stp->st_stateowner = &oo->oo_owner;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = boot_time;
-	stp->st_stateid.si_stateownerid = sop->so_id;
-	stp->st_stateid.si_fileid = fp->fi_id;
-	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	__set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
-		  &stp->st_access_bmap);
+	__set_bit(open->op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 	stp->st_openstp = NULL;
 }
 
 static void
-move_to_close_lru(struct nfs4_stateowner *sop)
+move_to_close_lru(struct nfs4_openowner *oo)
 {
-	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
+	dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
 
-	list_move_tail(&sop->so_close_lru, &close_lru);
-	sop->so_time = get_seconds();
+	list_move_tail(&oo->oo_close_lru, &close_lru);
+	oo->oo_time = get_seconds();
 }
 
 static int
@@ -2289,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
 		(sop->so_client->cl_clientid.cl_id == clid->cl_id);
 }
 
-static struct nfs4_stateowner *
+static struct nfs4_openowner *
 find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
 {
-	struct nfs4_stateowner *so = NULL;
+	struct nfs4_stateowner *so;
+	struct nfs4_openowner *oo;
 
-	list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
-		if (same_owner_str(so, &open->op_owner, &open->op_clientid))
-			return so;
+	list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) {
+		if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
+			oo = openowner(so);
+			renew_client(oo->oo_owner.so_client);
+			return oo;
+		}
 	}
 	return NULL;
 }
@@ -2320,31 +2465,6 @@ find_file(struct inode *ino)
 	return NULL;
 }
 
-static inline int access_valid(u32 x, u32 minorversion)
-{
-	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
-		return 0;
-	if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
-		return 0;
-	x &= ~NFS4_SHARE_ACCESS_MASK;
-	if (minorversion && x) {
-		if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
-			return 0;
-		if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
-			return 0;
-		x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
-	}
-	if (x)
-		return 0;
-	return 1;
-}
-
-static inline int deny_valid(u32 x)
-{
-	/* Note: unlike access bits, deny bits may be zero. */
-	return x <= NFS4_SHARE_DENY_BOTH;
-}
-
 /*
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
@@ -2354,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 {
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_file *fp;
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 	__be32 ret;
 
 	dprintk("NFSD: nfs4_share_conflict\n");
@@ -2429,6 +2549,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.lm_change = nfsd_change_deleg_cb,
 };
 
+static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid)
+{
+	if (nfsd4_has_session(cstate))
+		return nfs_ok;
+	if (seqid == so->so_seqid - 1)
+		return nfserr_replay_me;
+	if (seqid == so->so_seqid)
+		return nfs_ok;
+	return nfserr_bad_seqid;
+}
 
 __be32
 nfsd4_process_open1(struct nfsd4_compound_state *cstate,
@@ -2437,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
 	unsigned int strhashval;
-	struct nfs4_stateowner *sop = NULL;
-
-	if (!check_name(open->op_owner))
-		return nfserr_inval;
+	struct nfs4_openowner *oo = NULL;
+	__be32 status;
 
 	if (STALE_CLIENTID(&open->op_clientid))
 		return nfserr_stale_clientid;
+	/*
+	 * In case we need it later, after we've already created the
+	 * file and don't want to risk a further failure:
+	 */
+	open->op_file = nfsd4_alloc_file();
+	if (open->op_file == NULL)
+		return nfserr_jukebox;
 
-	strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner);
-	sop = find_openstateowner_str(strhashval, open);
-	open->op_stateowner = sop;
-	if (!sop) {
-		/* Make sure the client's lease hasn't expired. */
+	strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner);
+	oo = find_openstateowner_str(strhashval, open);
+	open->op_openowner = oo;
+	if (!oo) {
 		clp = find_confirmed_client(clientid);
 		if (clp == NULL)
 			return nfserr_expired;
-		goto renew;
+		goto new_owner;
 	}
-	/* When sessions are used, skip open sequenceid processing */
-	if (nfsd4_has_session(cstate))
-		goto renew;
-	if (!sop->so_confirmed) {
+	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
 		/* Replace unconfirmed owners without checking for replay. */
-		clp = sop->so_client;
-		release_openowner(sop);
-		open->op_stateowner = NULL;
-		goto renew;
-	}
-	if (open->op_seqid == sop->so_seqid - 1) {
-		if (sop->so_replay.rp_buflen)
-			return nfserr_replay_me;
-		/* The original OPEN failed so spectacularly
-		 * that we don't even have replay data saved!
-		 * Therefore, we have no choice but to continue
-		 * processing this OPEN; presumably, we'll
-		 * fail again for the same reason.
-		 */
-		dprintk("nfsd4_process_open1: replay with no replay cache\n");
-		goto renew;
-	}
-	if (open->op_seqid != sop->so_seqid)
-		return nfserr_bad_seqid;
-renew:
-	if (open->op_stateowner == NULL) {
-		sop = alloc_init_open_stateowner(strhashval, clp, open);
-		if (sop == NULL)
-			return nfserr_resource;
-		open->op_stateowner = sop;
+		clp = oo->oo_owner.so_client;
+		release_openowner(oo);
+		open->op_openowner = NULL;
+		goto new_owner;
 	}
-	list_del_init(&sop->so_close_lru);
-	renew_client(sop->so_client);
+	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
+	if (status)
+		return status;
+	clp = oo->oo_owner.so_client;
+	goto alloc_stateid;
+new_owner:
+	oo = alloc_init_open_stateowner(strhashval, clp, open);
+	if (oo == NULL)
+		return nfserr_jukebox;
+	open->op_openowner = oo;
+alloc_stateid:
+	open->op_stp = nfs4_alloc_stateid(clp);
+	if (!open->op_stp)
+		return nfserr_jukebox;
 	return nfs_ok;
 }
 
@@ -2500,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
 		return nfs_ok;
 }
 
-static struct nfs4_delegation *
-find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
+static int share_access_to_flags(u32 share_access)
 {
-	struct nfs4_delegation *dp;
+	share_access &= ~NFS4_SHARE_WANT_MASK;
 
-	spin_lock(&recall_lock);
-	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
-		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
-			spin_unlock(&recall_lock);
-			return dp;
-		}
-	spin_unlock(&recall_lock);
-	return NULL;
+	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
 }
 
-static int share_access_to_flags(u32 share_access)
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
 {
-	share_access &= ~NFS4_SHARE_WANT_MASK;
+	struct nfs4_stid *ret;
 
-	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+	ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID);
+	if (!ret)
+		return NULL;
+	return delegstateid(ret);
+}
+
+static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
+{
+	return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+	       open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH;
 }
 
 static __be32
-nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
+nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
 		struct nfs4_delegation **dp)
 {
 	int flags;
 	__be32 status = nfserr_bad_stateid;
 
-	*dp = find_delegation_file(fp, &open->op_delegate_stateid);
+	*dp = find_deleg_stateid(cl, &open->op_delegate_stateid);
 	if (*dp == NULL)
 		goto out;
 	flags = share_access_to_flags(open->op_share_access);
@@ -2537,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 	if (status)
 		*dp = NULL;
 out:
-	if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR)
+	if (!nfsd4_is_deleg_cur(open))
 		return nfs_ok;
 	if (status)
 		return status;
-	open->op_stateowner->so_confirmed = 1;
+	open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
 	return nfs_ok;
 }
 
 static __be32
-nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
+nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp)
 {
-	struct nfs4_stateid *local;
-	__be32 status = nfserr_share_denied;
-	struct nfs4_stateowner *sop = open->op_stateowner;
+	struct nfs4_ol_stateid *local;
+	struct nfs4_openowner *oo = open->op_openowner;
 
 	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
 		/* ignore lock owners */
 		if (local->st_stateowner->so_is_open_owner == 0)
 			continue;
 		/* remember if we have seen this open owner */
-		if (local->st_stateowner == sop)
+		if (local->st_stateowner == &oo->oo_owner)
 			*stpp = local;
 		/* check for conflicting share reservations */
 		if (!test_share(local, open))
-			goto out;
+			return nfserr_share_denied;
 	}
-	status = 0;
-out:
-	return status;
+	return nfs_ok;
 }
 
-static inline struct nfs4_stateid *
-nfs4_alloc_stateid(void)
+static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
 {
-	return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
+	kmem_cache_free(stateid_slab, s);
 }
 
 static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -2592,12 +2711,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
 	int oflag = nfs4_access_to_omode(open->op_share_access);
 	int access = nfs4_access_to_access(open->op_share_access);
 
-	/* CLAIM_DELEGATE_CUR is used in response to a broken lease;
-	 * allowing it to break the lease and return EAGAIN leaves the
-	 * client unable to make progress in returning the delegation */
-	if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
-		access |= NFSD_MAY_NOT_BREAK_LEASE;
-
 	if (!fp->fi_fds[oflag]) {
 		status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
 			&fp->fi_fds[oflag]);
@@ -2609,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
 	return nfs_ok;
 }
 
-static __be32
-nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
-		struct nfs4_file *fp, struct svc_fh *cur_fh,
-		struct nfsd4_open *open)
-{
-	struct nfs4_stateid *stp;
-	__be32 status;
-
-	stp = nfs4_alloc_stateid();
-	if (stp == NULL)
-		return nfserr_resource;
-
-	status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
-	if (status) {
-		kmem_cache_free(stateid_slab, stp);
-		return status;
-	}
-	*stpp = stp;
-	return 0;
-}
-
 static inline __be32
 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 		struct nfsd4_open *open)
@@ -2646,9 +2738,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 }
 
 static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
 {
-	u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+	u32 op_share_access = open->op_share_access;
 	bool new_access;
 	__be32 status;
 
@@ -2677,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
 static void
 nfs4_set_claim_prev(struct nfsd4_open *open)
 {
-	open->op_stateowner->so_confirmed = 1;
-	open->op_stateowner->so_client->cl_firststate = 1;
+	open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+	open->op_openowner->oo_owner.so_client->cl_firststate = 1;
 }
 
 /* Should we give out recallable state?: */
@@ -2721,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
 	if (!fl)
 		return -ENOMEM;
 	fl->fl_file = find_readable_file(fp);
-	list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
 	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
 	if (status) {
 		list_del_init(&dp->dl_perclnt);
@@ -2750,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
 	atomic_inc(&fp->fi_delegees);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	spin_unlock(&recall_lock);
-	list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+	list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
 	return 0;
 }
 
@@ -2758,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
  * Attempt to hand out a delegation.
  */
 static void
-nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp)
+nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp)
 {
 	struct nfs4_delegation *dp;
-	struct nfs4_stateowner *sop = stp->st_stateowner;
+	struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
 	int cb_up;
 	int status, flag = 0;
 
-	cb_up = nfsd4_cb_channel_good(sop->so_client);
+	cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
 	flag = NFS4_OPEN_DELEGATE_NONE;
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
@@ -2781,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			 * had the chance to reclaim theirs.... */
 			if (locks_in_grace())
 				goto out;
-			if (!cb_up || !sop->so_confirmed)
+			if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
 				goto out;
 			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 				flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2792,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			goto out;
 	}
 
-	dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
+	dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
 	if (dp == NULL)
 		goto out_no_deleg;
 	status = nfs4_set_delegation(dp, flag);
 	if (status)
 		goto out_free;
 
-	memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
+	memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
 
 	dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
-		STATEID_VAL(&dp->dl_stateid));
+		STATEID_VAL(&dp->dl_stid.sc_stateid));
 out:
 	if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
 			&& flag == NFS4_OPEN_DELEGATE_NONE
@@ -2824,16 +2916,13 @@ __be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
-	struct nfs4_stateid *stp = NULL;
+	struct nfs4_ol_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	__be32 status;
 
-	status = nfserr_inval;
-	if (!access_valid(open->op_share_access, resp->cstate.minorversion)
-			|| !deny_valid(open->op_share_deny))
-		goto out;
 	/*
 	 * Lookup file; if found, lookup stateid and check open request,
 	 * and check for delegations in the process of being recalled.
@@ -2843,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	if (fp) {
 		if ((status = nfs4_check_open(fp, open, &stp)))
 			goto out;
-		status = nfs4_check_deleg(fp, open, &dp);
+		status = nfs4_check_deleg(cl, fp, open, &dp);
 		if (status)
 			goto out;
 	} else {
 		status = nfserr_bad_stateid;
-		if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
-			goto out;
-		status = nfserr_resource;
-		fp = alloc_init_file(ino);
-		if (fp == NULL)
+		if (nfsd4_is_deleg_cur(open))
 			goto out;
+		status = nfserr_jukebox;
+		fp = open->op_file;
+		open->op_file = NULL;
+		nfsd4_init_file(fp, ino);
 	}
 
 	/*
@@ -2865,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status)
 			goto out;
-		update_stateid(&stp->st_stateid);
 	} else {
-		status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
+		status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
 		if (status)
 			goto out;
-		init_stateid(stp, fp, open);
+		stp = open->op_stp;
+		open->op_stp = NULL;
+		init_open_stateid(stp, fp, open);
 		status = nfsd4_truncate(rqstp, current_fh, open);
 		if (status) {
 			release_open_stateid(stp);
 			goto out;
 		}
-		if (nfsd4_has_session(&resp->cstate))
-			update_stateid(&stp->st_stateid);
 	}
-	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
 	if (nfsd4_has_session(&resp->cstate))
-		open->op_stateowner->so_confirmed = 1;
+		open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
 
 	/*
 	* Attempt to hand out a delegation. No error return, because the
@@ -2893,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	status = nfs_ok;
 
 	dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
-		STATEID_VAL(&stp->st_stateid));
+		STATEID_VAL(&stp->st_stid.sc_stateid));
 out:
 	if (fp)
 		put_nfs4_file(fp);
@@ -2903,13 +2992,34 @@ out:
 	* To finish the open response, we just need to set the rflags.
 	*/
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!open->op_stateowner->so_confirmed &&
+	if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
 	    !nfsd4_has_session(&resp->cstate))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
 
 	return status;
 }
 
+void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
+{
+	if (open->op_openowner) {
+		struct nfs4_openowner *oo = open->op_openowner;
+
+		if (!list_empty(&oo->oo_owner.so_stateids))
+			list_del_init(&oo->oo_close_lru);
+		if (oo->oo_flags & NFS4_OO_NEW) {
+			if (status) {
+				release_openowner(oo);
+				open->op_openowner = NULL;
+			} else
+				oo->oo_flags &= ~NFS4_OO_NEW;
+		}
+	}
+	if (open->op_file)
+		nfsd4_free_file(open->op_file);
+	if (open->op_stp)
+		nfs4_free_stateid(open->op_stp);
+}
+
 __be32
 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    clientid_t *clid)
@@ -2930,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		dprintk("nfsd4_renew: clientid not found!\n");
 		goto out;
 	}
-	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
 			&& clp->cl_cb_state != NFSD4_CB_UP)
@@ -2962,7 +3071,7 @@ static time_t
 nfs4_laundromat(void)
 {
 	struct nfs4_client *clp;
-	struct nfs4_stateowner *sop;
+	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct list_head *pos, *next, reaplist;
 	time_t cutoff = get_seconds() - nfsd4_lease;
@@ -3019,16 +3128,14 @@ nfs4_laundromat(void)
 	}
 	test_val = nfsd4_lease;
 	list_for_each_safe(pos, next, &close_lru) {
-		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
-		if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
-			u = sop->so_time - cutoff;
+		oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
+		if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
+			u = oo->oo_time - cutoff;
 			if (test_val > u)
 				test_val = u;
 			break;
 		}
-		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
-			sop->so_id);
-		release_openowner(sop);
+		release_openowner(oo);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -3050,30 +3157,17 @@ laundromat_main(struct work_struct *not_used)
 	queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
 }
 
-static struct nfs4_stateowner *
-search_close_lru(u32 st_id, int flags)
+static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
 {
-	struct nfs4_stateowner *local = NULL;
-
-	if (flags & CLOSE_STATE) {
-		list_for_each_entry(local, &close_lru, so_close_lru) {
-			if (local->so_id == st_id)
-				return local;
-		}
-	}
-	return NULL;
-}
-
-static inline int
-nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
-{
-	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
+	if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode)
+		return nfserr_bad_stateid;
+	return nfs_ok;
 }
 
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-	if (stateid->si_boot == boot_time)
+	if (stateid->si_opaque.so_clid.cl_boot == boot_time)
 		return 0;
 	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
 		STATEID_VAL(stateid));
@@ -3096,7 +3190,7 @@ access_permit_write(unsigned long access_bmap)
 }
 
 static
-__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
+__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
 {
         __be32 status = nfserr_openmode;
 
@@ -3139,68 +3233,80 @@ grace_disallows_io(struct inode *inode)
 	return locks_in_grace() && mandatory_lock(inode);
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
+/* Returns true iff a is later than b: */
+static bool stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+	return (s32)a->si_generation - (s32)b->si_generation > 0;
+}
+
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
 {
 	/*
 	 * When sessions are used the stateid generation number is ignored
 	 * when it is zero.
 	 */
-	if ((flags & HAS_SESSION) && in->si_generation == 0)
-		goto out;
+	if (has_session && in->si_generation == 0)
+		return nfs_ok;
+
+	if (in->si_generation == ref->si_generation)
+		return nfs_ok;
 
 	/* If the client sends us a stateid from the future, it's buggy: */
-	if (in->si_generation > ref->si_generation)
+	if (stateid_generation_after(in, ref))
 		return nfserr_bad_stateid;
 	/*
-	 * The following, however, can happen.  For example, if the
-	 * client sends an open and some IO at the same time, the open
-	 * may bump si_generation while the IO is still in flight.
-	 * Thanks to hard links and renames, the client never knows what
-	 * file an open will affect.  So it could avoid that situation
-	 * only by serializing all opens and IO from the same open
-	 * owner.  To recover from the old_stateid error, the client
-	 * will just have to retry the IO:
+	 * However, we could see a stateid from the past, even from a
+	 * non-buggy client.  For example, if the client sends a lock
+	 * while some IO is outstanding, the lock may bump si_generation
+	 * while the IO is still in flight.  The client could avoid that
+	 * situation by waiting for responses on all the IO requests,
+	 * but better performance may result in retrying IO that
+	 * receives an old_stateid error if requests are rarely
+	 * reordered in flight:
 	 */
-	if (in->si_generation < ref->si_generation)
-		return nfserr_old_stateid;
-out:
-	return nfs_ok;
+	return nfserr_old_stateid;
 }
 
-static int is_delegation_stateid(stateid_t *stateid)
+__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 {
-	return stateid->si_fileid == 0;
-}
+	struct nfs4_stid *s;
+	struct nfs4_ol_stateid *ols;
+	__be32 status;
 
-static int is_open_stateid(struct nfs4_stateid *stateid)
-{
-	return stateid->st_openstp == NULL;
+	if (STALE_STATEID(stateid))
+		return nfserr_stale_stateid;
+
+	s = find_stateid(cl, stateid);
+	if (!s)
+		 return nfserr_stale_stateid;
+	status = check_stateid_generation(stateid, &s->sc_stateid, 1);
+	if (status)
+		return status;
+	if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
+		return nfs_ok;
+	ols = openlockstateid(s);
+	if (ols->st_stateowner->so_is_open_owner
+	    && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+		return nfserr_bad_stateid;
+	return nfs_ok;
 }
 
-__be32 nfs4_validate_stateid(stateid_t *stateid, int flags)
+static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s)
 {
-	struct nfs4_stateid *stp = NULL;
-	__be32 status = nfserr_stale_stateid;
+	struct nfs4_client *cl;
 
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		return nfserr_bad_stateid;
 	if (STALE_STATEID(stateid))
-		goto out;
-
-	status = nfserr_expired;
-	stp = search_for_stateid(stateid);
-	if (!stp)
-		goto out;
-	status = nfserr_bad_stateid;
-
-	if (!stp->st_stateowner->so_confirmed)
-		goto out;
-
-	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
-	if (status)
-		goto out;
+		return nfserr_stale_stateid;
+	cl = find_confirmed_client(&stateid->si_opaque.so_clid);
+	if (!cl)
+		return nfserr_expired;
+	*s = find_stateid_by_type(cl, stateid, typemask);
+	if (!*s)
+		return nfserr_bad_stateid;
+	return nfs_ok;
 
-	status = nfs_ok;
-out:
-	return status;
 }
 
 /*
@@ -3210,7 +3316,8 @@ __be32
 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			   stateid_t *stateid, int flags, struct file **filpp)
 {
-	struct nfs4_stateid *stp = NULL;
+	struct nfs4_stid *s;
+	struct nfs4_ol_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	struct svc_fh *current_fh = &cstate->current_fh;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
@@ -3222,60 +3329,47 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	if (grace_disallows_io(ino))
 		return nfserr_grace;
 
-	if (nfsd4_has_session(cstate))
-		flags |= HAS_SESSION;
-
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return check_special_stateids(current_fh, stateid, flags);
 
-	status = nfserr_stale_stateid;
-	if (STALE_STATEID(stateid)) 
+	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s);
+	if (status)
+		return status;
+	status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
+	if (status)
 		goto out;
-
-	/*
-	 * We assume that any stateid that has the current boot time,
-	 * but that we can't find, is expired:
-	 */
-	status = nfserr_expired;
-	if (is_delegation_stateid(stateid)) {
-		dp = find_delegation_stateid(ino, stateid);
-		if (!dp)
-			goto out;
-		status = check_stateid_generation(stateid, &dp->dl_stateid,
-						  flags);
-		if (status)
-			goto out;
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
+		dp = delegstateid(s);
 		status = nfs4_check_delegmode(dp, flags);
 		if (status)
 			goto out;
-		renew_client(dp->dl_client);
 		if (filpp) {
 			*filpp = dp->dl_file->fi_deleg_file;
 			BUG_ON(!*filpp);
 		}
-	} else { /* open or lock stateid */
-		stp = find_stateid(stateid, flags);
-		if (!stp)
-			goto out;
-		status = nfserr_bad_stateid;
-		if (nfs4_check_fh(current_fh, stp))
-			goto out;
-		if (!stp->st_stateowner->so_confirmed)
-			goto out;
-		status = check_stateid_generation(stateid, &stp->st_stateid,
-						  flags);
+		break;
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		stp = openlockstateid(s);
+		status = nfs4_check_fh(current_fh, stp);
 		if (status)
 			goto out;
+		if (stp->st_stateowner->so_is_open_owner
+		    && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+			goto out;
 		status = nfs4_check_openmode(stp, flags);
 		if (status)
 			goto out;
-		renew_client(stp->st_stateowner->so_client);
 		if (filpp) {
 			if (flags & RD_STATE)
 				*filpp = find_readable_file(stp->st_file);
 			else
 				*filpp = find_writeable_file(stp->st_file);
 		}
+		break;
+	default:
+		return nfserr_bad_stateid;
 	}
 	status = nfs_ok;
 out:
@@ -3283,18 +3377,9 @@ out:
 }
 
 static __be32
-nfsd4_free_delegation_stateid(stateid_t *stateid)
+nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
 {
-	struct nfs4_delegation *dp = search_for_delegation(stateid);
-	if (dp)
-		return nfserr_locks_held;
-	return nfserr_bad_stateid;
-}
-
-static __be32
-nfsd4_free_lock_stateid(struct nfs4_stateid *stp)
-{
-	if (check_for_locks(stp->st_file, stp->st_stateowner))
+	if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
 		return nfserr_locks_held;
 	release_lock_stateid(stp);
 	return nfs_ok;
@@ -3307,51 +3392,40 @@ __be32
 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   struct nfsd4_test_stateid *test_stateid)
 {
-	test_stateid->ts_has_session = nfsd4_has_session(cstate);
+	/* real work is done during encoding */
 	return nfs_ok;
 }
 
-/*
- * Free a state id
- */
 __be32
 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   struct nfsd4_free_stateid *free_stateid)
 {
 	stateid_t *stateid = &free_stateid->fr_stateid;
-	struct nfs4_stateid *stp;
-	__be32 ret;
+	struct nfs4_stid *s;
+	struct nfs4_client *cl = cstate->session->se_client;
+	__be32 ret = nfserr_bad_stateid;
 
 	nfs4_lock_state();
-	if (is_delegation_stateid(stateid)) {
-		ret = nfsd4_free_delegation_stateid(stateid);
-		goto out;
-	}
-
-	stp = search_for_stateid(stateid);
-	if (!stp) {
-		ret = nfserr_bad_stateid;
+	s = find_stateid(cl, stateid);
+	if (!s)
 		goto out;
-	}
-	if (stateid->si_generation != 0) {
-		if (stateid->si_generation < stp->st_stateid.si_generation) {
-			ret = nfserr_old_stateid;
-			goto out;
-		}
-		if (stateid->si_generation > stp->st_stateid.si_generation) {
-			ret = nfserr_bad_stateid;
-			goto out;
-		}
-	}
-
-	if (is_open_stateid(stp)) {
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
 		ret = nfserr_locks_held;
 		goto out;
-	} else {
-		ret = nfsd4_free_lock_stateid(stp);
-		goto out;
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+		if (ret)
+			goto out;
+		if (s->sc_type == NFS4_LOCK_STID)
+			ret = nfsd4_free_lock_stateid(openlockstateid(s));
+		else
+			ret = nfserr_locks_held;
+		break;
+	default:
+		ret = nfserr_bad_stateid;
 	}
-
 out:
 	nfs4_unlock_state();
 	return ret;
@@ -3364,124 +3438,64 @@ setlkflg (int type)
 		RD_STATE : WR_STATE;
 }
 
+static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	struct nfs4_stateowner *sop = stp->st_stateowner;
+	__be32 status;
+
+	status = nfsd4_check_seqid(cstate, sop, seqid);
+	if (status)
+		return status;
+	if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
+		/*
+		 * "Closed" stateid's exist *only* to return
+		 * nfserr_replay_me from the previous step.
+		 */
+		return nfserr_bad_stateid;
+	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+	if (status)
+		return status;
+	return nfs4_check_fh(current_fh, stp);
+}
+
 /* 
  * Checks for sequence id mutating operations. 
  */
 static __be32
 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
-			 stateid_t *stateid, int flags,
-			 struct nfs4_stateowner **sopp,
-			 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+			 stateid_t *stateid, char typemask,
+			 struct nfs4_ol_stateid **stpp)
 {
-	struct nfs4_stateid *stp;
-	struct nfs4_stateowner *sop;
-	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
+	struct nfs4_stid *s;
 
 	dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
 		seqid, STATEID_VAL(stateid));
 
 	*stpp = NULL;
-	*sopp = NULL;
-
-	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
-		dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
-		return nfserr_bad_stateid;
-	}
-
-	if (STALE_STATEID(stateid))
-		return nfserr_stale_stateid;
-
-	if (nfsd4_has_session(cstate))
-		flags |= HAS_SESSION;
-
-	/*
-	* We return BAD_STATEID if filehandle doesn't match stateid, 
-	* the confirmed flag is incorrecly set, or the generation 
-	* number is incorrect.  
-	*/
-	stp = find_stateid(stateid, flags);
-	if (stp == NULL) {
-		/*
-		 * Also, we should make sure this isn't just the result of
-		 * a replayed close:
-		 */
-		sop = search_close_lru(stateid->si_stateownerid, flags);
-		/* It's not stale; let's assume it's expired: */
-		if (sop == NULL)
-			return nfserr_expired;
-		*sopp = sop;
-		goto check_replay;
-	}
-
-	*stpp = stp;
-	*sopp = sop = stp->st_stateowner;
-
-	if (lock) {
-		clientid_t *lockclid = &lock->v.new.clientid;
-		struct nfs4_client *clp = sop->so_client;
-		int lkflg = 0;
-		__be32 status;
-
-		lkflg = setlkflg(lock->lk_type);
-
-		if (lock->lk_is_new) {
-			if (!sop->so_is_open_owner)
-				return nfserr_bad_stateid;
-			if (!(flags & HAS_SESSION) &&
-			    !same_clid(&clp->cl_clientid, lockclid))
-				return nfserr_bad_stateid;
-			/* stp is the open stateid */
-			status = nfs4_check_openmode(stp, lkflg);
-			if (status)
-				return status;
-		} else {
-			/* stp is the lock stateid */
-			status = nfs4_check_openmode(stp->st_openstp, lkflg);
-			if (status)
-				return status;
-               }
-	}
+	status = nfsd4_lookup_stateid(stateid, typemask, &s);
+	if (status)
+		return status;
+	*stpp = openlockstateid(s);
+	cstate->replay_owner = (*stpp)->st_stateowner;
 
-	if (nfs4_check_fh(current_fh, stp)) {
-		dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n");
-		return nfserr_bad_stateid;
-	}
+	return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
+}
 
-	/*
-	*  We now validate the seqid and stateid generation numbers.
-	*  For the moment, we ignore the possibility of 
-	*  generation number wraparound.
-	*/
-	if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
-		goto check_replay;
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
+{
+	__be32 status;
+	struct nfs4_openowner *oo;
 
-	if (sop->so_confirmed && flags & CONFIRM) {
-		dprintk("NFSD: preprocess_seqid_op: expected"
-				" unconfirmed stateowner!\n");
-		return nfserr_bad_stateid;
-	}
-	if (!sop->so_confirmed && !(flags & CONFIRM)) {
-		dprintk("NFSD: preprocess_seqid_op: stateowner not"
-				" confirmed yet!\n");
-		return nfserr_bad_stateid;
-	}
-	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
+	status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
+						NFS4_OPEN_STID, stpp);
 	if (status)
 		return status;
-	renew_client(sop->so_client);
+	oo = openowner((*stpp)->st_stateowner);
+	if (!(oo->oo_flags & NFS4_OO_CONFIRMED))
+		return nfserr_bad_stateid;
 	return nfs_ok;
-
-check_replay:
-	if (seqid == sop->so_seqid - 1) {
-		dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
-		/* indicate replay to calling function */
-		return nfserr_replay_me;
-	}
-	dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
-			sop->so_seqid, seqid);
-	*sopp = NULL;
-	return nfserr_bad_seqid;
 }
 
 __be32
@@ -3489,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   struct nfsd4_open_confirm *oc)
 {
 	__be32 status;
-	struct nfs4_stateowner *sop;
-	struct nfs4_stateid *stp;
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *stp;
 
 	dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
 			(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3502,38 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(cstate,
+	status = nfs4_preprocess_seqid_op(cstate,
 					oc->oc_seqid, &oc->oc_req_stateid,
-					CONFIRM | OPEN_STATE,
-					&oc->oc_stateowner, &stp, NULL)))
-		goto out; 
-
-	sop = oc->oc_stateowner;
-	sop->so_confirmed = 1;
-	update_stateid(&stp->st_stateid);
-	memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
+					NFS4_OPEN_STID, &stp);
+	if (status)
+		goto out;
+	oo = openowner(stp->st_stateowner);
+	status = nfserr_bad_stateid;
+	if (oo->oo_flags & NFS4_OO_CONFIRMED)
+		goto out;
+	oo->oo_flags |= NFS4_OO_CONFIRMED;
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
-		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
+		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
 
-	nfsd4_create_clid_dir(sop->so_client);
+	nfsd4_create_clid_dir(oo->oo_owner.so_client);
+	status = nfs_ok;
 out:
-	if (oc->oc_stateowner) {
-		nfs4_get_stateowner(oc->oc_stateowner);
-		cstate->replay_owner = oc->oc_stateowner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 }
 
-static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access)
+static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
 {
-	int i;
+	if (!test_bit(access, &stp->st_access_bmap))
+		return;
+	nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
+	__clear_bit(access, &stp->st_access_bmap);
+}
 
-	for (i = 1; i < 4; i++) {
-		if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) {
-			nfs4_file_put_access(stp->st_file, i);
-			__clear_bit(i, &stp->st_access_bmap);
-		}
+static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
+{
+	switch (to_access) {
+	case NFS4_SHARE_ACCESS_READ:
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE);
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+		break;
+	case NFS4_SHARE_ACCESS_WRITE:
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ);
+		nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+		break;
+	case NFS4_SHARE_ACCESS_BOTH:
+		break;
+	default:
+		BUG();
 	}
 }
 
@@ -3553,24 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		     struct nfsd4_open_downgrade *od)
 {
 	__be32 status;
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 
 	dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
-	if (!access_valid(od->od_share_access, cstate->minorversion)
-			|| !deny_valid(od->od_share_deny))
-		return nfserr_inval;
+	/* We don't yet support WANT bits: */
+	od->od_share_access &= NFS4_SHARE_ACCESS_MASK;
 
 	nfs4_lock_state();
-	if ((status = nfs4_preprocess_seqid_op(cstate,
-					od->od_seqid,
-					&od->od_stateid, 
-					OPEN_STATE,
-					&od->od_stateowner, &stp, NULL)))
+	status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
+					&od->od_stateid, &stp);
+	if (status)
 		goto out; 
-
 	status = nfserr_inval;
 	if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
 		dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
@@ -3582,22 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 			stp->st_deny_bmap, od->od_share_deny);
 		goto out;
 	}
-	nfs4_file_downgrade(stp, od->od_share_access);
+	nfs4_stateid_downgrade(stp, od->od_share_access);
 
 	reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
 
-	update_stateid(&stp->st_stateid);
-	memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t));
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
 out:
-	if (od->od_stateowner) {
-		nfs4_get_stateowner(od->od_stateowner);
-		cstate->replay_owner = od->od_stateowner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 }
 
+void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
+{
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *s;
+
+	if (!so->so_is_open_owner)
+		return;
+	oo = openowner(so);
+	s = oo->oo_last_closed_stid;
+	if (!s)
+		return;
+	if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
+		/* Release the last_closed_stid on the next seqid bump: */
+		oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
+		return;
+	}
+	oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
+	release_last_closed_stateid(oo);
+}
+
+static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+{
+	unhash_open_stateid(s);
+	s->st_stid.sc_type = NFS4_CLOSED_STID;
+}
+
 /*
  * nfs4_unlock_state() called after encode
  */
@@ -3606,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_close *close)
 {
 	__be32 status;
-	struct nfs4_stateid *stp;
+	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *stp;
 
 	dprintk("NFSD: nfsd4_close on file %.*s\n", 
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
 	nfs4_lock_state();
-	/* check close_lru for replay */
-	if ((status = nfs4_preprocess_seqid_op(cstate,
-					close->cl_seqid,
-					&close->cl_stateid, 
-					OPEN_STATE | CLOSE_STATE,
-					&close->cl_stateowner, &stp, NULL)))
+	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
+					&close->cl_stateid,
+					NFS4_OPEN_STID|NFS4_CLOSED_STID,
+					&stp);
+	if (status)
 		goto out; 
+	oo = openowner(stp->st_stateowner);
 	status = nfs_ok;
-	update_stateid(&stp->st_stateid);
-	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
-	/* release_stateid() calls nfsd_close() if needed */
-	release_open_stateid(stp);
+	nfsd4_close_open_stateid(stp);
+	oo->oo_last_closed_stid = stp;
 
 	/* place unused nfs4_stateowners on so_close_lru list to be
 	 * released by the laundromat service after the lease period
 	 * to enable us to handle CLOSE replay
 	 */
-	if (list_empty(&close->cl_stateowner->so_stateids))
-		move_to_close_lru(close->cl_stateowner);
+	if (list_empty(&oo->oo_owner.so_stateids))
+		move_to_close_lru(oo);
 out:
-	if (close->cl_stateowner) {
-		nfs4_get_stateowner(close->cl_stateowner);
-		cstate->replay_owner = close->cl_stateowner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 }
 
@@ -3648,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	struct nfs4_delegation *dp;
 	stateid_t *stateid = &dr->dr_stateid;
+	struct nfs4_stid *s;
 	struct inode *inode;
 	__be32 status;
-	int flags = 0;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		return status;
 	inode = cstate->current_fh.fh_dentry->d_inode;
 
-	if (nfsd4_has_session(cstate))
-		flags |= HAS_SESSION;
 	nfs4_lock_state();
-	status = nfserr_bad_stateid;
-	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-		goto out;
-	status = nfserr_stale_stateid;
-	if (STALE_STATEID(stateid))
-		goto out;
-	status = nfserr_bad_stateid;
-	if (!is_delegation_stateid(stateid))
-		goto out;
-	status = nfserr_expired;
-	dp = find_delegation_stateid(inode, stateid);
-	if (!dp)
+	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s);
+	if (status)
 		goto out;
-	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+	dp = delegstateid(s);
+	status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
 	if (status)
 		goto out;
-	renew_client(dp->dl_client);
 
 	unhash_delegation(dp);
 out:
@@ -3713,9 +3746,6 @@ last_byte_offset(u64 start, u64 len)
 	return end > start ? end - 1: NFS4_MAX_UINT64;
 }
 
-#define lockownerid_hashval(id) \
-        ((id) & LOCK_HASH_MASK)
-
 static inline unsigned int
 lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
 		struct xdr_netobj *ownername)
@@ -3725,101 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
 		& LOCK_HASH_MASK;
 }
 
-static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
 static struct list_head	lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
-static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
-
-static int
-same_stateid(stateid_t *id_one, stateid_t *id_two)
-{
-	if (id_one->si_stateownerid != id_two->si_stateownerid)
-		return 0;
-	return id_one->si_fileid == id_two->si_fileid;
-}
-
-static struct nfs4_stateid *
-find_stateid(stateid_t *stid, int flags)
-{
-	struct nfs4_stateid *local;
-	u32 st_id = stid->si_stateownerid;
-	u32 f_id = stid->si_fileid;
-	unsigned int hashval;
-
-	dprintk("NFSD: find_stateid flags 0x%x\n",flags);
-	if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
-		hashval = stateid_hashval(st_id, f_id);
-		list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
-			if ((local->st_stateid.si_stateownerid == st_id) &&
-			    (local->st_stateid.si_fileid == f_id))
-				return local;
-		}
-	} 
-
-	if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
-		hashval = stateid_hashval(st_id, f_id);
-		list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
-			if ((local->st_stateid.si_stateownerid == st_id) &&
-			    (local->st_stateid.si_fileid == f_id))
-				return local;
-		}
-	}
-	return NULL;
-}
-
-static struct nfs4_stateid *
-search_for_stateid(stateid_t *stid)
-{
-	struct nfs4_stateid *local;
-	unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid);
-
-	list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
-		if (same_stateid(&local->st_stateid, stid))
-			return local;
-	}
-
-	list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
-		if (same_stateid(&local->st_stateid, stid))
-			return local;
-	}
-	return NULL;
-}
-
-static struct nfs4_delegation *
-search_for_delegation(stateid_t *stid)
-{
-	struct nfs4_file *fp;
-	struct nfs4_delegation *dp;
-	struct list_head *pos;
-	int i;
-
-	for (i = 0; i < FILE_HASH_SIZE; i++) {
-		list_for_each_entry(fp, &file_hashtbl[i], fi_hash) {
-			list_for_each(pos, &fp->fi_delegations) {
-				dp = list_entry(pos, struct nfs4_delegation, dl_perfile);
-				if (same_stateid(&dp->dl_stateid, stid))
-					return dp;
-			}
-		}
-	}
-	return NULL;
-}
-
-static struct nfs4_delegation *
-find_delegation_stateid(struct inode *ino, stateid_t *stid)
-{
-	struct nfs4_file *fp;
-	struct nfs4_delegation *dl;
-
-	dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
-		STATEID_VAL(stid));
-
-	fp = find_file(ino);
-	if (!fp)
-		return NULL;
-	dl = find_delegation_file(fp, stid);
-	put_nfs4_file(fp);
-	return dl;
-}
 
 /*
  * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3846,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops  = {
 static inline void
 nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 {
-	struct nfs4_stateowner *sop;
+	struct nfs4_lockowner *lo;
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
-		sop = (struct nfs4_stateowner *) fl->fl_owner;
-		kref_get(&sop->so_ref);
-		deny->ld_sop = sop;
-		deny->ld_clientid = sop->so_client->cl_clientid;
+		lo = (struct nfs4_lockowner *) fl->fl_owner;
+		deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data,
+					lo->lo_owner.so_owner.len, GFP_KERNEL);
+		if (!deny->ld_owner.data)
+			/* We just don't care that much */
+			goto nevermind;
+		deny->ld_owner.len = lo->lo_owner.so_owner.len;
+		deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
 	} else {
-		deny->ld_sop = NULL;
+nevermind:
+		deny->ld_owner.len = 0;
+		deny->ld_owner.data = NULL;
 		deny->ld_clientid.cl_boot = 0;
 		deny->ld_clientid.cl_id = 0;
 	}
@@ -3867,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 		deny->ld_type = NFS4_WRITE_LT;
 }
 
-static struct nfs4_stateowner *
-find_lockstateowner_str(struct inode *inode, clientid_t *clid,
+static struct nfs4_lockowner *
+find_lockowner_str(struct inode *inode, clientid_t *clid,
 		struct xdr_netobj *owner)
 {
 	unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner);
@@ -3876,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
 
 	list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) {
 		if (same_owner_str(op, owner, clid))
-			return op;
+			return lockowner(op);
 	}
 	return NULL;
 }
 
+static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
+{
+	list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]);
+	list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
+}
+
 /*
  * Alloc a lock owner structure.
  * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 
@@ -3889,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
  * strhashval = lock_ownerstr_hashval 
  */
 
-static struct nfs4_stateowner *
-alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) {
-	struct nfs4_stateowner *sop;
-	struct nfs4_replay *rp;
-	unsigned int idhashval;
+static struct nfs4_lockowner *
+alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) {
+	struct nfs4_lockowner *lo;
 
-	if (!(sop = alloc_stateowner(&lock->lk_new_owner)))
+	lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
+	if (!lo)
 		return NULL;
-	idhashval = lockownerid_hashval(current_ownerid);
-	INIT_LIST_HEAD(&sop->so_idhash);
-	INIT_LIST_HEAD(&sop->so_strhash);
-	INIT_LIST_HEAD(&sop->so_perclient);
-	INIT_LIST_HEAD(&sop->so_stateids);
-	INIT_LIST_HEAD(&sop->so_perstateid);
-	INIT_LIST_HEAD(&sop->so_close_lru); /* not used */
-	sop->so_time = 0;
-	list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]);
-	list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]);
-	list_add(&sop->so_perstateid, &open_stp->st_lockowners);
-	sop->so_is_open_owner = 0;
-	sop->so_id = current_ownerid++;
-	sop->so_client = clp;
+	INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
+	lo->lo_owner.so_is_open_owner = 0;
 	/* It is the openowner seqid that will be incremented in encode in the
 	 * case of new lockowners; so increment the lock seqid manually: */
-	sop->so_seqid = lock->lk_new_lock_seqid + 1;
-	sop->so_confirmed = 1;
-	rp = &sop->so_replay;
-	rp->rp_status = nfserr_serverfault;
-	rp->rp_buflen = 0;
-	rp->rp_buf = rp->rp_ibuf;
-	return sop;
+	lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1;
+	hash_lockowner(lo, strhashval, clp, open_stp);
+	return lo;
 }
 
-static struct nfs4_stateid *
-alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp)
+static struct nfs4_ol_stateid *
+alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp)
 {
-	struct nfs4_stateid *stp;
-	unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
+	struct nfs4_ol_stateid *stp;
+	struct nfs4_client *clp = lo->lo_owner.so_client;
 
-	stp = nfs4_alloc_stateid();
+	stp = nfs4_alloc_stateid(clp);
 	if (stp == NULL)
-		goto out;
-	INIT_LIST_HEAD(&stp->st_hash);
-	INIT_LIST_HEAD(&stp->st_perfile);
-	INIT_LIST_HEAD(&stp->st_perstateowner);
-	INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
-	list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
+		return NULL;
+	init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
-	list_add(&stp->st_perstateowner, &sop->so_stateids);
-	stp->st_stateowner = sop;
+	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+	stp->st_stateowner = &lo->lo_owner;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = boot_time;
-	stp->st_stateid.si_stateownerid = sop->so_id;
-	stp->st_stateid.si_fileid = fp->fi_id;
-	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
-
-out:
 	return stp;
 }
 
@@ -3960,7 +3881,7 @@ check_lock_length(u64 offset, u64 length)
 	     LOFF_OVERFLOW(offset, length)));
 }
 
-static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
+static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
 {
 	struct nfs4_file *fp = lock_stp->st_file;
 	int oflag = nfs4_access_to_omode(access);
@@ -3978,15 +3899,16 @@ __be32
 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	   struct nfsd4_lock *lock)
 {
-	struct nfs4_stateowner *open_sop = NULL;
-	struct nfs4_stateowner *lock_sop = NULL;
-	struct nfs4_stateid *lock_stp;
+	struct nfs4_openowner *open_sop = NULL;
+	struct nfs4_lockowner *lock_sop = NULL;
+	struct nfs4_ol_stateid *lock_stp;
 	struct nfs4_file *fp;
 	struct file *filp = NULL;
 	struct file_lock file_lock;
 	struct file_lock conflock;
 	__be32 status = 0;
 	unsigned int strhashval;
+	int lkflg;
 	int err;
 
 	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -4010,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * Use open owner and open stateid to create lock owner and
 		 * lock stateid.
 		 */
-		struct nfs4_stateid *open_stp = NULL;
+		struct nfs4_ol_stateid *open_stp = NULL;
 		
 		status = nfserr_stale_clientid;
 		if (!nfsd4_has_session(cstate) &&
@@ -4018,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 
 		/* validate and update open stateid and open seqid */
-		status = nfs4_preprocess_seqid_op(cstate,
+		status = nfs4_preprocess_confirmed_seqid_op(cstate,
 				        lock->lk_new_open_seqid,
 		                        &lock->lk_new_open_stateid,
-					OPEN_STATE,
-		                        &lock->lk_replay_owner, &open_stp,
-					lock);
+					&open_stp);
 		if (status)
 			goto out;
-		open_sop = lock->lk_replay_owner;
+		open_sop = openowner(open_stp->st_stateowner);
+		status = nfserr_bad_stateid;
+		if (!nfsd4_has_session(cstate) &&
+			!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+						&lock->v.new.clientid))
+			goto out;
 		/* create lockowner and lock stateid */
 		fp = open_stp->st_file;
-		strhashval = lock_ownerstr_hashval(fp->fi_inode, 
-				open_sop->so_client->cl_clientid.cl_id, 
+		strhashval = lock_ownerstr_hashval(fp->fi_inode,
+				open_sop->oo_owner.so_client->cl_clientid.cl_id,
 				&lock->v.new.owner);
 		/* XXX: Do we need to check for duplicate stateowners on
 		 * the same file, or should they just be allowed (and
 		 * create new stateids)? */
-		status = nfserr_resource;
+		status = nfserr_jukebox;
 		lock_sop = alloc_init_lock_stateowner(strhashval,
-				open_sop->so_client, open_stp, lock);
+				open_sop->oo_owner.so_client, open_stp, lock);
 		if (lock_sop == NULL)
 			goto out;
 		lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
@@ -4046,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	} else {
 		/* lock (lock owner + lock stateid) already exists */
 		status = nfs4_preprocess_seqid_op(cstate,
-				       lock->lk_old_lock_seqid, 
-				       &lock->lk_old_lock_stateid, 
-				       LOCK_STATE,
-				       &lock->lk_replay_owner, &lock_stp, lock);
+				       lock->lk_old_lock_seqid,
+				       &lock->lk_old_lock_stateid,
+				       NFS4_LOCK_STID, &lock_stp);
 		if (status)
 			goto out;
-		lock_sop = lock->lk_replay_owner;
+		lock_sop = lockowner(lock_stp->st_stateowner);
 		fp = lock_stp->st_file;
 	}
-	/* lock->lk_replay_owner and lock_stp have been created or found */
+	/* lock_sop and lock_stp have been created or found */
+
+	lkflg = setlkflg(lock->lk_type);
+	status = nfs4_check_openmode(lock_stp, lkflg);
+	if (status)
+		goto out;
 
 	status = nfserr_grace;
 	if (locks_in_grace() && !lock->lk_reclaim)
@@ -4106,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
 	switch (-err) {
 	case 0: /* success! */
-		update_stateid(&lock_stp->st_stateid);
-		memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, 
+		update_stateid(&lock_stp->st_stid.sc_stateid);
+		memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, 
 				sizeof(stateid_t));
 		status = 0;
 		break;
@@ -4119,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	case (EDEADLK):
 		status = nfserr_deadlock;
 		break;
-	default:        
+	default:
 		dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
-		status = nfserr_resource;
+		status = nfserrno(err);
 		break;
 	}
 out:
 	if (status && lock->lk_is_new && lock_sop)
 		release_lockowner(lock_sop);
-	if (lock->lk_replay_owner) {
-		nfs4_get_stateowner(lock->lk_replay_owner);
-		cstate->replay_owner = lock->lk_replay_owner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 }
 
@@ -4163,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	struct inode *inode;
 	struct file_lock file_lock;
+	struct nfs4_lockowner *lo;
 	int error;
 	__be32 status;
 
@@ -4172,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (check_lock_length(lockt->lt_offset, lockt->lt_length))
 		 return nfserr_inval;
 
-	lockt->lt_stateowner = NULL;
 	nfs4_lock_state();
 
 	status = nfserr_stale_clientid;
 	if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
 		goto out;
 
-	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
-		dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n");
-		if (status == nfserr_symlink)
-			status = nfserr_inval;
+	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		goto out;
-	}
 
 	inode = cstate->current_fh.fh_dentry->d_inode;
 	locks_init_lock(&file_lock);
@@ -4203,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	}
 
-	lockt->lt_stateowner = find_lockstateowner_str(inode,
-			&lockt->lt_clientid, &lockt->lt_owner);
-	if (lockt->lt_stateowner)
-		file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
+	lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
+	if (lo)
+		file_lock.fl_owner = (fl_owner_t)lo;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_flags = FL_POSIX;
 
@@ -4234,7 +4155,7 @@ __be32
 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_locku *locku)
 {
-	struct nfs4_stateid *stp;
+	struct nfs4_ol_stateid *stp;
 	struct file *filp = NULL;
 	struct file_lock file_lock;
 	__be32 status;
@@ -4249,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 									        
-	if ((status = nfs4_preprocess_seqid_op(cstate,
-					locku->lu_seqid, 
-					&locku->lu_stateid, 
-					LOCK_STATE,
-					&locku->lu_stateowner, &stp, NULL)))
+	status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
+					&locku->lu_stateid, NFS4_LOCK_STID, &stp);
+	if (status)
 		goto out;
-
 	filp = find_any_file(stp->st_file);
 	if (!filp) {
 		status = nfserr_lock_range;
@@ -4264,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	BUG_ON(!filp);
 	locks_init_lock(&file_lock);
 	file_lock.fl_type = F_UNLCK;
-	file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner;
+	file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_file = filp;
 	file_lock.fl_flags = FL_POSIX; 
@@ -4285,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/*
 	* OK, unlock succeeded; the only thing left to do is update the stateid.
 	*/
-	update_stateid(&stp->st_stateid);
-	memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t));
+	update_stateid(&stp->st_stid.sc_stateid);
+	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
 out:
-	if (locku->lu_stateowner) {
-		nfs4_get_stateowner(locku->lu_stateowner);
-		cstate->replay_owner = locku->lu_stateowner;
-	}
-	nfs4_unlock_state();
+	if (!cstate->replay_owner)
+		nfs4_unlock_state();
 	return status;
 
 out_nfserr:
@@ -4307,7 +4222,7 @@ out_nfserr:
  * 	0: no locks held by lockowner
  */
 static int
-check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
 {
 	struct file_lock **flpp;
 	struct inode *inode = filp->fi_inode;
@@ -4332,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 {
 	clientid_t *clid = &rlockowner->rl_clientid;
 	struct nfs4_stateowner *sop;
-	struct nfs4_stateid *stp;
+	struct nfs4_lockowner *lo;
+	struct nfs4_ol_stateid *stp;
 	struct xdr_netobj *owner = &rlockowner->rl_owner;
 	struct list_head matches;
 	int i;
@@ -4356,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 	 * data structures. */
 	INIT_LIST_HEAD(&matches);
 	for (i = 0; i < LOCK_HASH_SIZE; i++) {
-		list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) {
+		list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) {
 			if (!same_owner_str(sop, owner, clid))
 				continue;
 			list_for_each_entry(stp, &sop->so_stateids,
 					st_perstateowner) {
-				if (check_for_locks(stp->st_file, sop))
+				lo = lockowner(sop);
+				if (check_for_locks(stp->st_file, lo))
 					goto out;
-				/* Note: so_perclient unused for lockowners,
-				 * so it's OK to fool with here. */
-				list_add(&sop->so_perclient, &matches);
+				list_add(&lo->lo_list, &matches);
 			}
 		}
 	}
@@ -4374,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 	 * have been checked. */
 	status = nfs_ok;
 	while (!list_empty(&matches)) {
-		sop = list_entry(matches.next, struct nfs4_stateowner,
-								so_perclient);
+		lo = list_entry(matches.next, struct nfs4_lockowner,
+								lo_list);
 		/* unhash_stateowner deletes so_perclient only
 		 * for openowners. */
-		list_del(&sop->so_perclient);
-		release_lockowner(sop);
+		list_del(&lo->lo_list);
+		release_lockowner(lo);
 	}
 out:
 	nfs4_unlock_state();
@@ -4501,16 +4416,10 @@ nfs4_state_init(void)
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
-	for (i = 0; i < OWNER_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
-		INIT_LIST_HEAD(&ownerid_hashtbl[i]);
-	}
-	for (i = 0; i < STATEID_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&stateid_hashtbl[i]);
-		INIT_LIST_HEAD(&lockstateid_hashtbl[i]);
+	for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
 	}
 	for (i = 0; i < LOCK_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]);
 		INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
 	}
 	memset(&onestateid, ~0, sizeof(stateid_t));
@@ -4527,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void)
 	int status;
 
 	nfs4_lock_state();
-	nfsd4_init_recdir(user_recovery_dirname);
+	nfsd4_init_recdir();
 	status = nfsd4_recdir_load();
 	nfs4_unlock_state();
 	if (status)
@@ -4636,40 +4545,3 @@ nfs4_state_shutdown(void)
 	nfs4_unlock_state();
 	nfsd4_destroy_callback_queue();
 }
-
-/*
- * user_recovery_dirname is protected by the nfsd_mutex since it's only
- * accessed when nfsd is starting.
- */
-static void
-nfs4_set_recdir(char *recdir)
-{
-	strcpy(user_recovery_dirname, recdir);
-}
-
-/*
- * Change the NFSv4 recovery directory to recdir.
- */
-int
-nfs4_reset_recoverydir(char *recdir)
-{
-	int status;
-	struct path path;
-
-	status = kern_path(recdir, LOOKUP_FOLLOW, &path);
-	if (status)
-		return status;
-	status = -ENOTDIR;
-	if (S_ISDIR(path.dentry->d_inode->i_mode)) {
-		nfs4_set_recdir(recdir);
-		status = 0;
-	}
-	path_put(&path);
-	return status;
-}
-
-char *
-nfs4_recoverydir(void)
-{
-	return user_recovery_dirname;
-}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c8bf405d19d..66d095d7955 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
 {
 	DECODE_HEAD;
 
-	close->cl_stateowner = NULL;
 	READ_BUF(4);
 	READ32(close->cl_seqid);
 	return nfsd4_decode_stateid(argp, &close->cl_stateid);
@@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 {
 	DECODE_HEAD;
 
-	lock->lk_replay_owner = NULL;
 	/*
 	* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
 	*/
@@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
 {
 	DECODE_HEAD;
 
-	locku->lu_stateowner = NULL;
 	READ_BUF(8);
 	READ32(locku->lu_type);
 	if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
@@ -642,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x)
+{
+	__be32 *p;
+	u32 w;
+
+	READ_BUF(4);
+	READ32(w);
+	*x = w;
+	switch (w & NFS4_SHARE_ACCESS_MASK) {
+	case NFS4_SHARE_ACCESS_READ:
+	case NFS4_SHARE_ACCESS_WRITE:
+	case NFS4_SHARE_ACCESS_BOTH:
+		break;
+	default:
+		return nfserr_bad_xdr;
+	}
+	w &= !NFS4_SHARE_ACCESS_MASK;
+	if (!w)
+		return nfs_ok;
+	if (!argp->minorversion)
+		return nfserr_bad_xdr;
+	switch (w & NFS4_SHARE_WANT_MASK) {
+	case NFS4_SHARE_WANT_NO_PREFERENCE:
+	case NFS4_SHARE_WANT_READ_DELEG:
+	case NFS4_SHARE_WANT_WRITE_DELEG:
+	case NFS4_SHARE_WANT_ANY_DELEG:
+	case NFS4_SHARE_WANT_NO_DELEG:
+	case NFS4_SHARE_WANT_CANCEL:
+		break;
+	default:
+		return nfserr_bad_xdr;
+	}
+	w &= ~NFS4_SHARE_WANT_MASK;
+	if (!w)
+		return nfs_ok;
+	switch (w) {
+	case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
+	case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED:
+	case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL |
+	      NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
+		return nfs_ok;
+	}
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
+{
+	__be32 *p;
+
+	READ_BUF(4);
+	READ32(*x);
+	/* Note: unlinke access bits, deny bits may be zero. */
+	if (*x & ~NFS4_SHARE_DENY_BOTH)
+		return nfserr_bad_xdr;
+	return nfs_ok;
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
+{
+	__be32 *p;
+
+	READ_BUF(4);
+	READ32(o->len);
+
+	if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
+		return nfserr_bad_xdr;
+
+	READ_BUF(o->len);
+	SAVEMEM(o->data, o->len);
+	return nfs_ok;
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
 static __be32
 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 {
@@ -649,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 
 	memset(open->op_bmval, 0, sizeof(open->op_bmval));
 	open->op_iattr.ia_valid = 0;
-	open->op_stateowner = NULL;
+	open->op_openowner = NULL;
 
 	/* seqid, share_access, share_deny, clientid, ownerlen */
-	READ_BUF(16 + sizeof(clientid_t));
+	READ_BUF(4);
 	READ32(open->op_seqid);
-	READ32(open->op_share_access);
-	READ32(open->op_share_deny);
+	status = nfsd4_decode_share_access(argp, &open->op_share_access);
+	if (status)
+		goto xdr_error;
+	status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+	if (status)
+		goto xdr_error;
+	READ_BUF(sizeof(clientid_t));
 	COPYMEM(&open->op_clientid, sizeof(clientid_t));
-	READ32(open->op_owner.len);
-
-	/* owner, open_flag */
-	READ_BUF(open->op_owner.len + 4);
-	SAVEMEM(open->op_owner.data, open->op_owner.len);
+	status = nfsd4_decode_opaque(argp, &open->op_owner);
+	if (status)
+		goto xdr_error;
+	READ_BUF(4);
 	READ32(open->op_create);
 	switch (open->op_create) {
 	case NFS4_OPEN_NOCREATE:
@@ -727,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
 			return status;
 		break;
+	case NFS4_OPEN_CLAIM_FH:
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+		if (argp->minorversion < 1)
+			goto xdr_error;
+		/* void */
+		break;
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+		if (argp->minorversion < 1)
+			goto xdr_error;
+		status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+		if (status)
+			return status;
+		break;
 	default:
 		goto xdr_error;
 	}
@@ -739,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
 {
 	DECODE_HEAD;
 		    
-	open_conf->oc_stateowner = NULL;
 	status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
 	if (status)
 		return status;
@@ -754,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
 {
 	DECODE_HEAD;
 		    
-	open_down->od_stateowner = NULL;
 	status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
 	if (status)
 		return status;
-	READ_BUF(12);
+	READ_BUF(4);
 	READ32(open_down->od_seqid);
-	READ32(open_down->od_share_access);
-	READ32(open_down->od_share_deny);
-						        
+	status = nfsd4_decode_share_access(argp, &open_down->od_share_access);
+	if (status)
+		return status;
+	status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
+	if (status)
+		return status;
 	DECODE_TAIL;
 }
 
@@ -903,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
 {
 	DECODE_HEAD;
 
-	READ_BUF(12);
+	READ_BUF(8);
 	COPYMEM(setclientid->se_verf.data, 8);
-	READ32(setclientid->se_namelen);
 
-	READ_BUF(setclientid->se_namelen + 8);
-	SAVEMEM(setclientid->se_name, setclientid->se_namelen);
+	status = nfsd4_decode_opaque(argp, &setclientid->se_name);
+	if (status)
+		return nfserr_bad_xdr;
+	READ_BUF(8);
 	READ32(setclientid->se_callback_prog);
 	READ32(setclientid->se_callback_netid_len);
 
@@ -1051,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
 	READ_BUF(NFS4_VERIFIER_SIZE);
 	COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
 
-	READ_BUF(4);
-	READ32(exid->clname.len);
-
-	READ_BUF(exid->clname.len);
-	SAVEMEM(exid->clname.data, exid->clname.len);
+	status = nfsd4_decode_opaque(argp, &exid->clname);
+	if (status)
+		return nfserr_bad_xdr;
 
 	READ_BUF(4);
 	READ32(exid->flags);
@@ -1326,6 +1417,16 @@ xdr_error:
 	goto out;
 }
 
+static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(8);
+	COPYMEM(&dc->clientid, 8);
+
+	DECODE_TAIL;
+}
+
 static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
 {
 	DECODE_HEAD;
@@ -1447,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_test_stateid,
 	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_destroy_clientid,
 	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
 };
 
@@ -1630,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
  * we know whether the error to be returned is a sequence id mutating error.
  */
 
-#define ENCODE_SEQID_OP_TAIL(stateowner) do {			\
-	if (seqid_mutating_err(nfserr) && stateowner) { 	\
-		stateowner->so_seqid++;				\
-		stateowner->so_replay.rp_status = nfserr;   	\
-		stateowner->so_replay.rp_buflen = 		\
-			  (((char *)(resp)->p - (char *)save)); \
-		memcpy(stateowner->so_replay.rp_buf, save,      \
- 			stateowner->so_replay.rp_buflen); 	\
-	} } while (0);
+static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
+{
+	struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
+
+	if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
+		stateowner->so_seqid++;
+		stateowner->so_replay.rp_status = nfserr;
+		stateowner->so_replay.rp_buflen =
+			  (char *)resp->p - (char *)save;
+		memcpy(stateowner->so_replay.rp_buf, save,
+			stateowner->so_replay.rp_buflen);
+		nfsd4_purge_closed_stateid(stateowner);
+	}
+}
 
 /* Encode as an array of strings the string given with components
  * separated @sep.
@@ -1697,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
 }
 
 /*
- * Return the path to an export point in the pseudo filesystem namespace
- * Returned string is safe to use as long as the caller holds a reference
- * to @exp.
+ * Encode a path in RFC3530 'pathname4' format
  */
-static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
+static __be32 nfsd4_encode_path(const struct path *root,
+		const struct path *path, __be32 **pp, int *buflen)
 {
-	struct svc_fh tmp_fh;
-	char *path = NULL, *rootpath;
-	size_t rootlen;
+	struct path cur = {
+		.mnt = path->mnt,
+		.dentry = path->dentry,
+	};
+	__be32 *p = *pp;
+	struct dentry **components = NULL;
+	unsigned int ncomponents = 0;
+	__be32 err = nfserr_jukebox;
 
-	fh_init(&tmp_fh, NFS4_FHSIZE);
-	*stat = exp_pseudoroot(rqstp, &tmp_fh);
-	if (*stat)
-		return NULL;
-	rootpath = tmp_fh.fh_export->ex_pathname;
+	dprintk("nfsd4_encode_components(");
 
-	path = exp->ex_pathname;
+	path_get(&cur);
+	/* First walk the path up to the nfsd root, and store the
+	 * dentries/path components in an array.
+	 */
+	for (;;) {
+		if (cur.dentry == root->dentry && cur.mnt == root->mnt)
+			break;
+		if (cur.dentry == cur.mnt->mnt_root) {
+			if (follow_up(&cur))
+				continue;
+			goto out_free;
+		}
+		if ((ncomponents & 15) == 0) {
+			struct dentry **new;
+			new = krealloc(components,
+					sizeof(*new) * (ncomponents + 16),
+					GFP_KERNEL);
+			if (!new)
+				goto out_free;
+			components = new;
+		}
+		components[ncomponents++] = cur.dentry;
+		cur.dentry = dget_parent(cur.dentry);
+	}
 
-	rootlen = strlen(rootpath);
-	if (strncmp(path, rootpath, rootlen)) {
-		dprintk("nfsd: fs_locations failed;"
-			"%s is not contained in %s\n", path, rootpath);
-		*stat = nfserr_notsupp;
-		path = NULL;
-		goto out;
+	*buflen -= 4;
+	if (*buflen < 0)
+		goto out_free;
+	WRITE32(ncomponents);
+
+	while (ncomponents) {
+		struct dentry *dentry = components[ncomponents - 1];
+		unsigned int len = dentry->d_name.len;
+
+		*buflen -= 4 + (XDR_QUADLEN(len) << 2);
+		if (*buflen < 0)
+			goto out_free;
+		WRITE32(len);
+		WRITEMEM(dentry->d_name.name, len);
+		dprintk("/%s", dentry->d_name.name);
+		dput(dentry);
+		ncomponents--;
 	}
-	path += rootlen;
-out:
-	fh_put(&tmp_fh);
-	return path;
+
+	*pp = p;
+	err = 0;
+out_free:
+	dprintk(")\n");
+	while (ncomponents)
+		dput(components[--ncomponents]);
+	kfree(components);
+	path_put(&cur);
+	return err;
+}
+
+static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
+		const struct path *path, __be32 **pp, int *buflen)
+{
+	struct svc_export *exp_ps;
+	__be32 res;
+
+	exp_ps = rqst_find_fsidzero_export(rqstp);
+	if (IS_ERR(exp_ps))
+		return nfserrno(PTR_ERR(exp_ps));
+	res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen);
+	exp_put(exp_ps);
+	return res;
 }
 
 /*
@@ -1740,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
 	int i;
 	__be32 *p = *pp;
 	struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
-	char *root = nfsd4_path(rqstp, exp, &status);
 
-	if (status)
-		return status;
-	status = nfsd4_encode_components('/', root, &p, buflen);
+	status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen);
 	if (status)
 		return status;
 	if ((*buflen -= 4) < 0)
@@ -1760,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
 	return 0;
 }
 
-static u32 nfs4_ftypes[16] = {
-        NF4BAD,  NF4FIFO, NF4CHR, NF4BAD,
-        NF4DIR,  NF4BAD,  NF4BLK, NF4BAD,
-        NF4REG,  NF4BAD,  NF4LNK, NF4BAD,
-        NF4SOCK, NF4BAD,  NF4LNK, NF4BAD,
-};
+static u32 nfs4_file_type(umode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFIFO:	return NF4FIFO;
+	case S_IFCHR:	return NF4CHR;
+	case S_IFDIR:	return NF4DIR;
+	case S_IFBLK:	return NF4BLK;
+	case S_IFLNK:	return NF4LNK;
+	case S_IFREG:	return NF4REG;
+	case S_IFSOCK:	return NF4SOCK;
+	default:	return NF4BAD;
+	};
+}
 
 static __be32
 nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
@@ -1954,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_TYPE) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12];
+		dummy = nfs4_file_type(stat.mode);
 		if (dummy == NF4BAD)
 			goto out_serverfault;
 		WRITE32(dummy);
@@ -2488,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &close->cl_stateid);
 
-	ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
+	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2564,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
 static void
 nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
 {
+	struct xdr_netobj *conf = &ld->ld_owner;
 	__be32 *p;
 
-	RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0));
+	RESERVE_SPACE(32 + XDR_LEN(conf->len));
 	WRITE64(ld->ld_start);
 	WRITE64(ld->ld_length);
 	WRITE32(ld->ld_type);
-	if (ld->ld_sop) {
+	if (conf->len) {
 		WRITEMEM(&ld->ld_clientid, 8);
-		WRITE32(ld->ld_sop->so_owner.len);
-		WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len);
-		kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner);
+		WRITE32(conf->len);
+		WRITEMEM(conf->data, conf->len);
+		kfree(conf->data);
 	}  else {  /* non - nfsv4 lock in conflict, no clientid nor owner */
 		WRITE64((u64)0); /* clientid */
 		WRITE32(0); /* length of owner name */
@@ -2592,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
 	else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
-	ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
+	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2612,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &locku->lu_stateid);
 
-	ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
+	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2693,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 	}
 	/* XXX save filehandle here */
 out:
-	ENCODE_SEQID_OP_TAIL(open->op_stateowner);
+	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2705,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
 
-	ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
+	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2717,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &od->od_stateid);
 
-	ENCODE_SEQID_OP_TAIL(od->od_stateowner);
+	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2759,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
 			&maxcount);
 
-	if (nfserr == nfserr_symlink)
-		nfserr = nfserr_inval;
 	if (nfserr)
 		return nfserr;
 	eof = (read->rd_offset + maxcount >=
@@ -2886,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 	    readdir->common.err == nfserr_toosmall &&
 	    readdir->buffer == page) 
 		nfserr = nfserr_toosmall;
-	if (nfserr == nfserr_symlink)
-		nfserr = nfserr_notdir;
 	if (nfserr)
 		goto err_no_verf;
 
@@ -3218,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
 	WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
 	WRITE32(seq->seqid);
 	WRITE32(seq->slotid);
-	WRITE32(seq->maxslots);
-	/* For now: target_maxslots = maxslots */
-	WRITE32(seq->maxslots);
+	/* Note slotid's are numbered from zero: */
+	WRITE32(seq->maxslots - 1); /* sr_highest_slotid */
+	WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */
 	WRITE32(seq->status_flags);
 
 	ADJUST_ARGS();
@@ -3233,6 +3393,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
 			  struct nfsd4_test_stateid *test_stateid)
 {
 	struct nfsd4_compoundargs *argp;
+	struct nfs4_client *cl = resp->cstate.session->se_client;
 	stateid_t si;
 	__be32 *p;
 	int i;
@@ -3248,7 +3409,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
 	nfs4_lock_state();
 	for (i = 0; i < test_stateid->ts_num_ids; i++) {
 		nfsd4_decode_stateid(argp, &si);
-		valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session);
+		valid = nfs4_validate_stateid(cl, &si);
 		RESERVE_SPACE(4);
 		*p++ = htonl(valid);
 		resp->p = p;
@@ -3334,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 
 /*
  * Calculate the total amount of memory that the compound response has taken
- * after encoding the current operation.
+ * after encoding the current operation with pad.
  *
- * pad: add on 8 bytes for the next operation's op_code and status so that
- * there is room to cache a failure on the next operation.
+ * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop()
+ *      which was specified at nfsd4_operation, else pad is zero.
  *
- * Compare this length to the session se_fmaxresp_cached.
+ * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached.
  *
  * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
  * will be at least a page and will therefore hold the xdr_buf head.
  */
-static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
+int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
 {
-	int status = 0;
 	struct xdr_buf *xb = &resp->rqstp->rq_res;
-	struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
 	struct nfsd4_session *session = NULL;
 	struct nfsd4_slot *slot = resp->cstate.slot;
-	u32 length, tlen = 0, pad = 8;
+	u32 length, tlen = 0;
 
 	if (!nfsd4_has_session(&resp->cstate))
-		return status;
+		return 0;
 
 	session = resp->cstate.session;
-	if (session == NULL || slot->sl_cachethis == 0)
-		return status;
-
-	if (resp->opcnt >= args->opcnt)
-		pad = 0; /* this is the last operation */
+	if (session == NULL)
+		return 0;
 
 	if (xb->page_len == 0) {
 		length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3374,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
 	dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
 		length, xb->page_len, tlen, pad);
 
-	if (length <= session->se_fchannel.maxresp_cached)
-		return status;
-	else
+	if (length > session->se_fchannel.maxresp_sz)
+		return nfserr_rep_too_big;
+
+	if (slot->sl_cachethis == 1 &&
+	    length > session->se_fchannel.maxresp_cached)
 		return nfserr_rep_too_big_to_cache;
+
+	return 0;
 }
 
 void
@@ -3397,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 	       !nfsd4_enc_ops[op->opnum]);
 	op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
 	/* nfsd4_check_drc_limit guarantees enough room for error status */
-	if (!op->status && nfsd4_check_drc_limit(resp))
-		op->status = nfserr_rep_too_big_to_cache;
+	if (!op->status)
+		op->status = nfsd4_check_resp_size(resp, 0);
 status:
 	/*
 	 * Note: We write the status directly, instead of using WRITE32(),
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c7716143cbd..db34a585e11 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -9,7 +9,6 @@
 #include <linux/ctype.h>
 
 #include <linux/sunrpc/svcsock.h>
-#include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/gss_api.h>
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 7ecfa242030..58134a23fdf 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -11,13 +11,39 @@
 #include <linux/types.h>
 #include <linux/mount.h>
 
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/sunrpc/msg_prot.h>
+
 #include <linux/nfsd/debug.h>
 #include <linux/nfsd/export.h>
 #include <linux/nfsd/stats.h>
+
 /*
  * nfsd version
  */
 #define NFSD_SUPPORTED_MINOR_VERSION	1
+/*
+ * Maximum blocksizes supported by daemon under various circumstances.
+ */
+#define NFSSVC_MAXBLKSIZE       RPCSVC_MAXPAYLOAD
+/* NFSv2 is limited by the protocol specification, see RFC 1094 */
+#define NFSSVC_MAXBLKSIZE_V2    (8*1024)
+
+
+/*
+ * Largest number of bytes we need to allocate for an NFS
+ * call or reply.  Used to control buffer sizes.  We use
+ * the length of v3 WRITE, READDIR and READDIR replies
+ * which are an RPC header, up to 26 XDR units of reply
+ * data, and some page data.
+ *
+ * Note that accuracy here doesn't matter too much as the
+ * size is rounded up to a page size when allocating space.
+ */
+#define NFSD_BUFSIZE            ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
 
 struct readdir_cd {
 	__be32			err;	/* 0, nfserr, or nfserr_eof */
@@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
 #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
 	NFSD_WRITEABLE_ATTRS_WORD2
 
+extern int nfsd4_is_junction(struct dentry *dentry);
+#else
+static inline int nfsd4_is_junction(struct dentry *dentry)
+{
+	return 0;
+}
+
 #endif /* CONFIG_NFSD_V4 */
 
 #endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 90c6aa6d5e0..c763de5c115 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
  * the write call).
  */
 static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
+nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested)
 {
-	/* Type can be negative when creating hardlinks - not to a dir */
-	if (type > 0 && (mode & S_IFMT) != type) {
-		if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK)
-			return nfserr_symlink;
-		else if (type == S_IFDIR)
-			return nfserr_notdir;
-		else if ((mode & S_IFMT) == S_IFDIR)
-			return nfserr_isdir;
-		else
-			return nfserr_inval;
-	}
-	if (type < 0 && (mode & S_IFMT) == -type) {
-		if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK)
-			return nfserr_symlink;
-		else if (type == -S_IFDIR)
-			return nfserr_isdir;
-		else
-			return nfserr_notdir;
-	}
-	return 0;
+	mode &= S_IFMT;
+
+	if (requested == 0) /* the caller doesn't care */
+		return nfs_ok;
+	if (mode == requested)
+		return nfs_ok;
+	/*
+	 * v4 has an error more specific than err_notdir which we should
+	 * return in preference to err_notdir:
+	 */
+	if (rqstp->rq_vers == 4 && mode == S_IFLNK)
+		return nfserr_symlink;
+	if (requested == S_IFDIR)
+		return nfserr_notdir;
+	if (mode == S_IFDIR)
+		return nfserr_isdir;
+	return nfserr_inval;
 }
 
 static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4eefaf1b42e..a3cf38476a1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
 #ifndef _NFSD4_STATE_H
 #define _NFSD4_STATE_H
 
+#include <linux/idr.h>
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/nfsd/nfsfh.h>
 #include "nfsfh.h"
@@ -45,24 +46,20 @@ typedef struct {
 } clientid_t;
 
 typedef struct {
-	u32             so_boot;
-	u32             so_stateownerid;
-	u32             so_fileid;
+	clientid_t	so_clid;
+	u32		so_id;
 } stateid_opaque_t;
 
 typedef struct {
 	u32                     si_generation;
 	stateid_opaque_t        si_opaque;
 } stateid_t;
-#define si_boot           si_opaque.so_boot
-#define si_stateownerid   si_opaque.so_stateownerid
-#define si_fileid         si_opaque.so_fileid
 
 #define STATEID_FMT	"(%08x/%08x/%08x/%08x)"
 #define STATEID_VAL(s) \
-	(s)->si_boot, \
-	(s)->si_stateownerid, \
-	(s)->si_fileid, \
+	(s)->si_opaque.so_clid.cl_boot, \
+	(s)->si_opaque.so_clid.cl_id, \
+	(s)->si_opaque.so_id, \
 	(s)->si_generation
 
 struct nfsd4_callback {
@@ -76,17 +73,27 @@ struct nfsd4_callback {
 	bool cb_done;
 };
 
+struct nfs4_stid {
+#define NFS4_OPEN_STID 1
+#define NFS4_LOCK_STID 2
+#define NFS4_DELEG_STID 4
+/* For an open stateid kept around *only* to process close replays: */
+#define NFS4_CLOSED_STID 8
+	unsigned char sc_type;
+	stateid_t sc_stateid;
+	struct nfs4_client *sc_client;
+};
+
 struct nfs4_delegation {
+	struct nfs4_stid	dl_stid; /* must be first field */
 	struct list_head	dl_perfile;
 	struct list_head	dl_perclnt;
 	struct list_head	dl_recall_lru;  /* delegation recalled */
 	atomic_t		dl_count;       /* ref count */
-	struct nfs4_client	*dl_client;
 	struct nfs4_file	*dl_file;
 	u32			dl_type;
 	time_t			dl_time;
 /* For recall: */
-	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 	int			dl_retries;
 	struct nfsd4_callback	dl_recall;
@@ -104,6 +111,11 @@ struct nfs4_cb_conn {
 	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
 };
 
+static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
+{
+	return container_of(s, struct nfs4_delegation, dl_stid);
+}
+
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
 #define NFSD_MAX_SLOTS_PER_SESSION     160
 /* Maximum number of operations per session compound */
@@ -220,6 +232,7 @@ struct nfs4_client {
 	struct list_head	cl_idhash; 	/* hash by cl_clientid.id */
 	struct list_head	cl_strhash; 	/* hash by cl_name */
 	struct list_head	cl_openowners;
+	struct idr		cl_stateids;	/* stateid lookup */
 	struct list_head	cl_delegations;
 	struct list_head        cl_lru;         /* tail queue */
 	struct xdr_netobj	cl_name; 	/* id generated by client */
@@ -245,6 +258,7 @@ struct nfs4_client {
 #define NFSD4_CB_UP		0
 #define NFSD4_CB_UNKNOWN	1
 #define NFSD4_CB_DOWN		2
+#define NFSD4_CB_FAULT		3
 	int			cl_cb_state;
 	struct nfsd4_callback	cl_cb_null;
 	struct nfsd4_session	*cl_cb_session;
@@ -293,6 +307,9 @@ static inline void
 update_stateid(stateid_t *stateid)
 {
 	stateid->si_generation++;
+	/* Wraparound recommendation from 3530bis-13 9.1.3.2: */
+	if (stateid->si_generation == 0)
+		stateid->si_generation = 1;
 }
 
 /* A reasonable value for REPLAY_ISIZE was estimated as follows:  
@@ -312,49 +329,57 @@ struct nfs4_replay {
 	__be32			rp_status;
 	unsigned int		rp_buflen;
 	char			*rp_buf;
-	unsigned		intrp_allocated;
 	struct knfsd_fh		rp_openfh;
 	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
 };
 
-/*
-* nfs4_stateowner can either be an open_owner, or a lock_owner
-*
-*    so_idhash:  stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
-*         for lock_owner
-*    so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
-*         for lock_owner
-*    so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
-*         struct is reaped.
-*    so_perfilestate: heads the list of nfs4_stateid (either open or lock) 
-*         and is used to ensure no dangling nfs4_stateid references when we 
-*         release a stateowner.
-*    so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
-*         close is called to reap associated byte-range locks
-*    so_close_lru: (open) stateowner is placed on this list instead of being
-*         reaped (when so_perfilestate is empty) to hold the last close replay.
-*         reaped by laundramat thread after lease period.
-*/
 struct nfs4_stateowner {
-	struct kref		so_ref;
-	struct list_head        so_idhash;   /* hash by so_id */
 	struct list_head        so_strhash;   /* hash by op_name */
-	struct list_head        so_perclient;
 	struct list_head        so_stateids;
-	struct list_head        so_perstateid; /* for lockowners only */
-	struct list_head	so_close_lru; /* tail queue */
-	time_t			so_time; /* time of placement on so_close_lru */
-	int			so_is_open_owner; /* 1=openowner,0=lockowner */
-	u32                     so_id;
 	struct nfs4_client *    so_client;
 	/* after increment in ENCODE_SEQID_OP_TAIL, represents the next
 	 * sequence id expected from the client: */
 	u32                     so_seqid;
 	struct xdr_netobj       so_owner;     /* open owner name */
-	int                     so_confirmed; /* successful OPEN_CONFIRM? */
 	struct nfs4_replay	so_replay;
+	bool			so_is_open_owner;
 };
 
+struct nfs4_openowner {
+	struct nfs4_stateowner	oo_owner; /* must be first field */
+	struct list_head        oo_perclient;
+	/*
+	 * We keep around openowners a little while after last close,
+	 * which saves clients from having to confirm, and allows us to
+	 * handle close replays if they come soon enough.  The close_lru
+	 * is a list of such openowners, to be reaped by the laundromat
+	 * thread eventually if they remain unused:
+	 */
+	struct list_head	oo_close_lru;
+	struct nfs4_ol_stateid *oo_last_closed_stid;
+	time_t			oo_time; /* time of placement on so_close_lru */
+#define NFS4_OO_CONFIRMED   1
+#define NFS4_OO_PURGE_CLOSE 2
+#define NFS4_OO_NEW         4
+	unsigned char		oo_flags;
+};
+
+struct nfs4_lockowner {
+	struct nfs4_stateowner	lo_owner; /* must be first element */
+	struct list_head        lo_perstateid; /* for lockowners only */
+	struct list_head	lo_list; /* for temporary uses */
+};
+
+static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
+{
+	return container_of(so, struct nfs4_openowner, oo_owner);
+}
+
+static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
+{
+	return container_of(so, struct nfs4_lockowner, lo_owner);
+}
+
 /*
 *  nfs4_file: a file opened by some number of (open) nfs4_stateowners.
 *    o fi_perfile list is used to search for conflicting 
@@ -368,17 +393,17 @@ struct nfs4_file {
 	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
 	struct file *		fi_fds[3];
 	/*
-	 * Each open or lock stateid contributes 1 to either
-	 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
-	 * on open or lock mode:
+	 * Each open or lock stateid contributes 0-4 to the counts
+	 * below depending on which bits are set in st_access_bitmap:
+	 *     1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set
+	 *   + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set
+	 *   + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
 	 */
 	atomic_t		fi_access[2];
 	struct file		*fi_deleg_file;
 	struct file_lock	*fi_lease;
 	atomic_t		fi_delegees;
 	struct inode		*fi_inode;
-	u32                     fi_id;      /* used with stateowner->so_id 
-					     * for stateid_hashtbl hash */
 	bool			fi_had_conflict;
 };
 
@@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f)
 		return f->fi_fds[O_RDONLY];
 }
 
-/*
-* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
-*
-* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
-*
-* 	st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
-* 	st_perfile: file_hashtbl[] entry.
-* 	st_perfile_state: nfs4_stateowner->so_perfilestate
-*       st_perlockowner: (open stateid) list of lock nfs4_stateowners
-* 	st_access_bmap: used only for open stateid
-* 	st_deny_bmap: used only for open stateid
-*	st_openstp: open stateid lock stateid was derived from
-*
-* XXX: open stateids and lock stateids have diverged sufficiently that
-* we should consider defining separate structs for the two cases.
-*/
-
-struct nfs4_stateid {
-	struct list_head              st_hash; 
+/* "ol" stands for "Open or Lock".  Better suggestions welcome. */
+struct nfs4_ol_stateid {
+	struct nfs4_stid    st_stid; /* must be first field */
 	struct list_head              st_perfile;
 	struct list_head              st_perstateowner;
 	struct list_head              st_lockowners;
 	struct nfs4_stateowner      * st_stateowner;
 	struct nfs4_file            * st_file;
-	stateid_t                     st_stateid;
 	unsigned long                 st_access_bmap;
 	unsigned long                 st_deny_bmap;
-	struct nfs4_stateid         * st_openstp;
+	struct nfs4_ol_stateid         * st_openstp;
 };
 
+static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
+{
+	return container_of(s, struct nfs4_ol_stateid, st_stid);
+}
+
 /* flags for preprocess_seqid_op() */
-#define HAS_SESSION             0x00000001
-#define CONFIRM                 0x00000002
-#define OPEN_STATE              0x00000004
-#define LOCK_STATE              0x00000008
 #define RD_STATE	        0x00000010
 #define WR_STATE	        0x00000020
-#define CLOSE_STATE             0x00000040
-
-#define seqid_mutating_err(err)                       \
-	(((err) != nfserr_stale_clientid) &&    \
-	((err) != nfserr_bad_seqid) &&          \
-	((err) != nfserr_stale_stateid) &&      \
-	((err) != nfserr_bad_stateid))
 
 struct nfsd4_compound_state;
 
@@ -461,7 +463,8 @@ extern void nfs4_lock_state(void);
 extern void nfs4_unlock_state(void);
 extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
-extern void nfs4_free_stateowner(struct kref *kref);
+extern void nfs4_free_openowner(struct nfs4_openowner *);
+extern void nfs4_free_lockowner(struct nfs4_lockowner *);
 extern int set_callback_cred(void);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
@@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
-extern void nfsd4_init_recdir(char *recdir_name);
+extern void nfsd4_init_recdir(void);
 extern int nfsd4_recdir_load(void);
 extern void nfsd4_shutdown_recdir(void);
 extern int nfs4_client_to_reclaim(const char *name);
@@ -482,18 +485,7 @@ extern void nfsd4_recdir_purge_old(void);
 extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
 extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
 extern void release_session_client(struct nfsd4_session *);
-extern __be32 nfs4_validate_stateid(stateid_t *, int);
-
-static inline void
-nfs4_put_stateowner(struct nfs4_stateowner *so)
-{
-	kref_put(&so->so_ref, nfs4_free_stateowner);
-}
-
-static inline void
-nfs4_get_stateowner(struct nfs4_stateowner *so)
-{
-	kref_get(&so->so_ref);
-}
+extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
+extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
 
 #endif   /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fd0acca5370..7a2e442623c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
 {
 	if (d_mountpoint(dentry))
 		return 1;
+	if (nfsd4_is_junction(dentry))
+		return 1;
 	if (!(exp->ex_flags & NFSEXP_V4ROOT))
 		return 0;
 	return dentry->d_inode != NULL;
@@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	unsigned int flags = 0;
 
 	/* Get inode */
-	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
 	if (error)
 		return error;
 
@@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	return error;
 }
 
+#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
+#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
+int nfsd4_is_junction(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode == NULL)
+		return 0;
+	if (inode->i_mode & S_IXUGO)
+		return 0;
+	if (!(inode->i_mode & S_ISVTX))
+		return 0;
+	if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
+		return 0;
+	return 1;
+}
 #endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
@@ -1352,7 +1370,7 @@ __be32
 do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		char *fname, int flen, struct iattr *iap,
 		struct svc_fh *resfhp, int createmode, u32 *verifier,
-	        int *truncp, int *created)
+	        bool *truncp, bool *created)
 {
 	struct dentry	*dentry, *dchild = NULL;
 	struct inode	*dirp;
@@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
 	if (err)
 		goto out;
-	err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP);
+	err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
 	if (err)
 		goto out;
-
+	err = nfserr_isdir;
+	if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode))
+		goto out;
 	err = nfserr_perm;
 	if (!len)
 		goto out;
@@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 
 	/* Allow read access to binaries even when mode 111 */
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
-	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
+	     (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
+	      acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
 		err = inode_permission(inode, MAY_EXEC);
 
 	return err? nfserrno(err) : 0;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e0bbac04d1d..3f54ad03bb2 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -10,21 +10,22 @@
 /*
  * Flags for nfsd_permission
  */
-#define NFSD_MAY_NOP		0
-#define NFSD_MAY_EXEC		1 /* == MAY_EXEC */
-#define NFSD_MAY_WRITE		2 /* == MAY_WRITE */
-#define NFSD_MAY_READ		4 /* == MAY_READ */
-#define NFSD_MAY_SATTR		8
-#define NFSD_MAY_TRUNC		16
-#define NFSD_MAY_LOCK		32
-#define NFSD_MAY_MASK		63
+#define NFSD_MAY_NOP			0
+#define NFSD_MAY_EXEC			0x001 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE			0x002 /* == MAY_WRITE */
+#define NFSD_MAY_READ			0x004 /* == MAY_READ */
+#define NFSD_MAY_SATTR			0x008
+#define NFSD_MAY_TRUNC			0x010
+#define NFSD_MAY_LOCK			0x020
+#define NFSD_MAY_MASK			0x03f
 
 /* extra hints to permission and open routines: */
-#define NFSD_MAY_OWNER_OVERRIDE	64
-#define NFSD_MAY_LOCAL_ACCESS	128 /* IRIX doing local access check on device special file*/
-#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
-#define NFSD_MAY_NOT_BREAK_LEASE 512
-#define NFSD_MAY_BYPASS_GSS	1024
+#define NFSD_MAY_OWNER_OVERRIDE		0x040
+#define NFSD_MAY_LOCAL_ACCESS		0x080 /* for device special files */
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT	0x100
+#define NFSD_MAY_NOT_BREAK_LEASE	0x200
+#define NFSD_MAY_BYPASS_GSS		0x400
+#define NFSD_MAY_READ_IF_EXEC		0x800
 
 #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -61,7 +62,7 @@ __be32		nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
 __be32		do_nfsd_create(struct svc_rqst *, struct svc_fh *,
 				char *name, int len, struct iattr *attrs,
 				struct svc_fh *res, int createmode,
-				u32 *verifier, int *truncp, int *created);
+				u32 *verifier, bool *truncp, bool *created);
 __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 				loff_t, unsigned long);
 #endif /* CONFIG_NFSD_V3 */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d2a8d04428c..2364747ee97 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -81,7 +81,6 @@ struct nfsd4_access {
 struct nfsd4_close {
 	u32		cl_seqid;           /* request */
 	stateid_t	cl_stateid;         /* request+response */
-	struct nfs4_stateowner * cl_stateowner;	/* response */
 };
 
 struct nfsd4_commit {
@@ -131,7 +130,7 @@ struct nfsd4_link {
 
 struct nfsd4_lock_denied {
 	clientid_t	ld_clientid;
-	struct nfs4_stateowner   *ld_sop;
+	struct xdr_netobj	ld_owner;
 	u64             ld_start;
 	u64             ld_length;
 	u32             ld_type;
@@ -165,9 +164,6 @@ struct nfsd4_lock {
 		} ok;
 		struct nfsd4_lock_denied        denied;
 	} u;
-	/* The lk_replay_owner is the open owner in the open_to_lock_owner
-	 * case and the lock owner otherwise: */
-	struct nfs4_stateowner *lk_replay_owner;
 };
 #define lk_new_open_seqid       v.new.open_seqid
 #define lk_new_open_stateid     v.new.open_stateid
@@ -188,7 +184,6 @@ struct nfsd4_lockt {
 	struct xdr_netobj		lt_owner;
 	u64				lt_offset;
 	u64				lt_length;
-	struct nfs4_stateowner * 	lt_stateowner;
 	struct nfsd4_lock_denied  	lt_denied;
 };
 
@@ -199,7 +194,6 @@ struct nfsd4_locku {
 	stateid_t       lu_stateid;
 	u64             lu_offset;
 	u64             lu_length;
-	struct nfs4_stateowner  *lu_stateowner;
 };
 
 
@@ -232,8 +226,11 @@ struct nfsd4_open {
 	u32		op_recall;          /* recall */
 	struct nfsd4_change_info  op_cinfo; /* response */
 	u32		op_rflags;          /* response */
-	int		op_truncate;        /* used during processing */
-	struct nfs4_stateowner *op_stateowner; /* used during processing */
+	bool		op_truncate;        /* used during processing */
+	bool		op_created;         /* used during processing */
+	struct nfs4_openowner *op_openowner; /* used during processing */
+	struct nfs4_file *op_file;          /* used during processing */
+	struct nfs4_ol_stateid *op_stp;	    /* used during processing */
 	struct nfs4_acl *op_acl;
 };
 #define op_iattr	iattr
@@ -243,7 +240,6 @@ struct nfsd4_open_confirm {
 	stateid_t	oc_req_stateid		/* request */;
 	u32		oc_seqid    		/* request */;
 	stateid_t	oc_resp_stateid		/* response */;
-	struct nfs4_stateowner * oc_stateowner;	/* response */
 };
 
 struct nfsd4_open_downgrade {
@@ -251,7 +247,6 @@ struct nfsd4_open_downgrade {
 	u32             od_seqid;
 	u32             od_share_access;
 	u32             od_share_deny;
-	struct nfs4_stateowner *od_stateowner;
 };
 
 
@@ -325,8 +320,7 @@ struct nfsd4_setattr {
 
 struct nfsd4_setclientid {
 	nfs4_verifier	se_verf;            /* request */
-	u32		se_namelen;         /* request */
-	char *		se_name;            /* request */
+	struct xdr_netobj se_name;
 	u32		se_callback_prog;   /* request */
 	u32		se_callback_netid_len;  /* request */
 	char *		se_callback_netid_val;  /* request */
@@ -351,7 +345,6 @@ struct nfsd4_saved_compoundargs {
 
 struct nfsd4_test_stateid {
 	__be32		ts_num_ids;
-	__be32		ts_has_session;
 	struct nfsd4_compoundargs *ts_saved_args;
 	struct nfsd4_saved_compoundargs ts_savedp;
 };
@@ -405,6 +398,10 @@ struct nfsd4_destroy_session {
 	struct nfs4_sessionid	sessionid;
 };
 
+struct nfsd4_destroy_clientid {
+	clientid_t clientid;
+};
+
 struct nfsd4_reclaim_complete {
 	u32 rca_one_fs;
 };
@@ -532,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
 		struct nfsd4_compoundargs *);
 int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
 		struct nfsd4_compoundres *);
+int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
 void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
 void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
 __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
@@ -558,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
 extern __be32 nfsd4_destroy_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_destroy_session *);
+extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
 __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
 extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
 		struct nfsd4_open *open);
 extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
 		struct svc_fh *current_fh, struct nfsd4_open *open);
+extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
 extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
 extern __be32 nfsd4_close(struct svc_rqst *rqstp,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 81ecf9c0bf0..194fb22ef79 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7185,20 +7185,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
 {
 	int ret = 0;
 	struct buffer_head *dir_bh = NULL;
-	struct ocfs2_security_xattr_info si = {
-		.enable = 1,
-	};
 
-	ret = ocfs2_init_security_get(inode, dir, qstr, &si);
+	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
 	if (!ret) {
-		ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
-				      si.name, si.value, si.value_len,
-				      XATTR_CREATE);
-		if (ret) {
-			mlog_errno(ret);
-			goto leave;
-		}
-	} else if (ret != -EOPNOTSUPP) {
 		mlog_errno(ret);
 		goto leave;
 	}
@@ -7255,6 +7244,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
 			       name, value, size, flags);
 }
 
+int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		     void *fs_info)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
+				      xattr->name, xattr->value,
+				      xattr->value_len, XATTR_CREATE);
+		if (err)
+			break;
+	}
+	return err;
+}
+
 int ocfs2_init_security_get(struct inode *inode,
 			    struct inode *dir,
 			    const struct qstr *qstr,
@@ -7263,8 +7268,13 @@ int ocfs2_init_security_get(struct inode *inode,
 	/* check whether ocfs2 support feature xattr */
 	if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
 		return -EOPNOTSUPP;
-	return security_inode_init_security(inode, dir, qstr, &si->name,
-					    &si->value, &si->value_len);
+	if (si)
+		return security_old_inode_init_security(inode, dir, qstr,
+							&si->name, &si->value,
+							&si->value_len);
+
+	return security_inode_init_security(inode, dir, qstr,
+					    &ocfs2_initxattrs, NULL);
 }
 
 int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 9758b654a1b..42b274da92c 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@
 #include <linux/time.h>
 #include <linux/irqnr.h>
 #include <asm/cputime.h>
+#include <linux/tick.h>
 
 #ifndef arch_irq_stat_cpu
 #define arch_irq_stat_cpu(cpu) 0
@@ -21,6 +22,35 @@
 #define arch_idle_time(cpu) 0
 #endif
 
+static cputime64_t get_idle_time(int cpu)
+{
+	u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
+	cputime64_t idle;
+
+	if (idle_time == -1ULL) {
+		/* !NO_HZ so we can rely on cpustat.idle */
+		idle = kstat_cpu(cpu).cpustat.idle;
+		idle = cputime64_add(idle, arch_idle_time(cpu));
+	} else
+		idle = usecs_to_cputime(idle_time);
+
+	return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+	u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+	cputime64_t iowait;
+
+	if (iowait_time == -1ULL)
+		/* !NO_HZ so we can rely on cpustat.iowait */
+		iowait = kstat_cpu(cpu).cpustat.iowait;
+	else
+		iowait = usecs_to_cputime(iowait_time);
+
+	return iowait;
+}
+
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i, j;
@@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v)
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
 		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
 		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
-		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
-		idle = cputime64_add(idle, arch_idle_time(i));
-		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+		idle = cputime64_add(idle, get_idle_time(i));
+		iowait = cputime64_add(iowait, get_iowait_time(i));
 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
@@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(guest),
 		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
-
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
 		user = kstat_cpu(i).cpustat.user;
 		nice = kstat_cpu(i).cpustat.nice;
 		system = kstat_cpu(i).cpustat.system;
-		idle = kstat_cpu(i).cpustat.idle;
-		idle = cputime64_add(idle, arch_idle_time(i));
-		iowait = kstat_cpu(i).cpustat.iowait;
+		idle = get_idle_time(i);
+		iowait = get_iowait_time(i);
 		irq = kstat_cpu(i).cpustat.irq;
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a159ba5a35e..eb711060a6f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb,
 	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
 		jb = jb_array + i;
 		jb->journal_list = NULL;
-		jb->bitmaps = vmalloc(mem);
+		jb->bitmaps = vzalloc(mem);
 		if (!jb->bitmaps) {
 			reiserfs_warning(sb, "clm-2000", "unable to "
 					 "allocate bitmaps for journal lists");
 			failed = 1;
 			break;
 		}
-		memset(jb->bitmaps, 0, mem);
 	}
 	if (failed) {
 		free_list_bitmaps(sb, jb_array);
@@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
 	if (num_cnodes <= 0) {
 		return NULL;
 	}
-	head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
+	head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
 	if (!head) {
 		return NULL;
 	}
-	memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode));
 	head[0].prev = NULL;
 	head[0].next = head + 1;
 	for (i = 1; i < num_cnodes; i++) {
@@ -2685,14 +2683,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	 * dependency inversion warnings.
 	 */
 	reiserfs_write_unlock(sb);
-	journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
+	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
 	if (!journal) {
 		reiserfs_warning(sb, "journal-1256",
 				 "unable to get memory for journal structure");
 		reiserfs_write_lock(sb);
 		return 1;
 	}
-	memset(journal, 0, sizeof(struct reiserfs_journal));
 	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
 	INIT_LIST_HEAD(&journal->j_prealloc_list);
 	INIT_LIST_HEAD(&journal->j_working_list);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index b6b9b1fe33b..7483279b482 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 		/* allocate additional bitmap blocks, reallocate array of bitmap
 		 * block pointers */
 		bitmap =
-		    vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
+		    vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
 		if (!bitmap) {
 			/* Journal bitmaps are still supersized, but the memory isn't
 			 * leaked, so I guess it's ok */
 			printk("reiserfs_resize: unable to allocate memory.\n");
 			return -ENOMEM;
 		}
-		memset(bitmap, 0,
-		       sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
 		for (i = 0; i < bmap_nr; i++)
 			bitmap[i] = old_bitmap[i];
 
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ef66c18a933..534668fa41b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 	if (IS_PRIVATE(dir))
 		return 0;
 
-	error = security_inode_init_security(inode, dir, qstr, &sec->name,
-					     &sec->value, &sec->length);
+	error = security_old_inode_init_security(inode, dir, qstr, &sec->name,
+						 &sec->value, &sec->length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
 			error = 0;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 1360d4f88f4..048b59d5b2f 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -19,9 +19,9 @@ config SQUASHFS
 
 	  If you want to compile this as a module ( = code which can be
 	  inserted in and removed from the running kernel whenever you want),
-	  say M here and read <file:Documentation/modules.txt>.  The module
-	  will be called squashfs.  Note that the root file system (the one
-	  containing the directory /) cannot be compiled as a module.
+	  say M here.  The module will be called squashfs.  Note that the root
+	  file system (the one containing the directory /) cannot be compiled
+	  as a module.
 
 	  If unsure, say N.
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ea9120a830d..48ffbdf0d01 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida);
 static void sysfs_link_sibling(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *parent_sd = sd->s_parent;
-	struct sysfs_dirent **pos;
 
-	BUG_ON(sd->s_sibling);
-
-	/* Store directory entries in order by ino.  This allows
-	 * readdir to properly restart without having to add a
-	 * cursor into the s_dir.children list.
-	 */
-	for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
-		if (sd->s_ino < (*pos)->s_ino)
-			break;
+	struct rb_node **p;
+	struct rb_node *parent;
+
+	if (sysfs_type(sd) == SYSFS_DIR)
+		parent_sd->s_dir.subdirs++;
+
+	p = &parent_sd->s_dir.inode_tree.rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+#define node	rb_entry(parent, struct sysfs_dirent, inode_node)
+		if (sd->s_ino < node->s_ino) {
+			p = &node->inode_node.rb_left;
+		} else if (sd->s_ino > node->s_ino) {
+			p = &node->inode_node.rb_right;
+		} else {
+			printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n",
+			       (unsigned long) sd->s_ino);
+			BUG();
+		}
+#undef node
 	}
-	sd->s_sibling = *pos;
-	*pos = sd;
+	rb_link_node(&sd->inode_node, parent, p);
+	rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree);
+
+	p = &parent_sd->s_dir.name_tree.rb_node;
+	parent = NULL;
+	while (*p) {
+		int c;
+		parent = *p;
+#define node	rb_entry(parent, struct sysfs_dirent, name_node)
+		c = strcmp(sd->s_name, node->s_name);
+		if (c < 0) {
+			p = &node->name_node.rb_left;
+		} else {
+			p = &node->name_node.rb_right;
+		}
+#undef node
+	}
+	rb_link_node(&sd->name_node, parent, p);
+	rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree);
 }
 
 /**
@@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
  */
 static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 {
-	struct sysfs_dirent **pos;
+	if (sysfs_type(sd) == SYSFS_DIR)
+		sd->s_parent->s_dir.subdirs--;
 
-	for (pos = &sd->s_parent->s_dir.children; *pos;
-	     pos = &(*pos)->s_sibling) {
-		if (*pos == sd) {
-			*pos = sd->s_sibling;
-			sd->s_sibling = NULL;
-			break;
-		}
-	}
+	rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree);
+	rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree);
 }
 
 /**
@@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
  */
 void sysfs_put_active(struct sysfs_dirent *sd)
 {
-	struct completion *cmpl;
 	int v;
 
 	if (unlikely(!sd))
@@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd)
 		return;
 
 	/* atomic_dec_return() is a mb(), we'll always see the updated
-	 * sd->s_sibling.
+	 * sd->u.completion.
 	 */
-	cmpl = (void *)sd->s_sibling;
-	complete(cmpl);
+	complete(sd->u.completion);
 }
 
 /**
@@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
 	DECLARE_COMPLETION_ONSTACK(wait);
 	int v;
 
-	BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
+	BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
 
 	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
 		return;
 
-	sd->s_sibling = (void *)&wait;
+	sd->u.completion = (void *)&wait;
 
 	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
 	/* atomic_add_return() is a mb(), put_active() will always see
-	 * the updated sd->s_sibling.
+	 * the updated sd->u.completion.
 	 */
 	v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
 
@@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
 		wait_for_completion(&wait);
 	}
 
-	sd->s_sibling = NULL;
-
 	lock_acquired(&sd->dep_map, _RET_IP_);
 	rwsem_release(&sd->dep_map, 1, _RET_IP_);
 }
@@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
+	if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
+		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+			sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
+			acxt->parent_sd->s_name, sd->s_name);
+		return -EINVAL;
+	}
+
 	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
 		return -EEXIST;
 
@@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 	}
 
 	sd->s_flags |= SYSFS_FLAG_REMOVED;
-	sd->s_sibling = acxt->removed;
+	sd->u.removed_list = acxt->removed;
 	acxt->removed = sd;
 }
 
@@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 	while (acxt->removed) {
 		struct sysfs_dirent *sd = acxt->removed;
 
-		acxt->removed = sd->s_sibling;
-		sd->s_sibling = NULL;
+		acxt->removed = sd->u.removed_list;
 
 		sysfs_deactivate(sd);
 		unmap_bin_file(sd);
@@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 				       const void *ns,
 				       const unsigned char *name)
 {
-	struct sysfs_dirent *sd;
+	struct rb_node *p = parent_sd->s_dir.name_tree.rb_node;
+	struct sysfs_dirent *found = NULL;
 
-	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
-		if (ns && sd->s_ns && (sd->s_ns != ns))
-			continue;
-		if (!strcmp(sd->s_name, name))
-			return sd;
+	if (!!sysfs_ns_type(parent_sd) != !!ns) {
+		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+			sysfs_ns_type(parent_sd)? "required": "invalid",
+			parent_sd->s_name, name);
+		return NULL;
 	}
-	return NULL;
+
+	while (p) {
+		int c;
+#define node	rb_entry(p, struct sysfs_dirent, name_node)
+		c = strcmp(name, node->s_name);
+		if (c < 0) {
+			p = node->name_node.rb_left;
+		} else if (c > 0) {
+			p = node->name_node.rb_right;
+		} else {
+			found = node;
+			p = node->name_node.rb_left;
+		}
+#undef node
+	}
+
+	if (found) {
+		while (found->s_ns != ns) {
+			p = rb_next(&found->name_node);
+			if (!p)
+				return NULL;
+			found = rb_entry(p, struct sysfs_dirent, name_node);
+			if (strcmp(name, found->s_name))
+				return NULL;
+		}
+	}
+
+	return found;
 }
 
 /**
@@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd)
 static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 {
 	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent **pos;
+	struct rb_node *pos;
 
 	if (!dir_sd)
 		return;
 
 	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
 	sysfs_addrm_start(&acxt, dir_sd);
-	pos = &dir_sd->s_dir.children;
-	while (*pos) {
-		struct sysfs_dirent *sd = *pos;
-
+	pos = rb_first(&dir_sd->s_dir.inode_tree);
+	while (pos) {
+		struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node);
+		pos = rb_next(pos);
 		if (sysfs_type(sd) != SYSFS_DIR)
 			sysfs_remove_one(&acxt, sd);
-		else
-			pos = &(*pos)->s_sibling;
 	}
 	sysfs_addrm_finish(&acxt);
 
@@ -881,12 +932,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
 			pos = NULL;
 	}
 	if (!pos && (ino > 1) && (ino < INT_MAX)) {
-		pos = parent_sd->s_dir.children;
-		while (pos && (ino > pos->s_ino))
-			pos = pos->s_sibling;
+		struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node;
+		while (p) {
+#define node	rb_entry(p, struct sysfs_dirent, inode_node)
+			if (ino < node->s_ino) {
+				pos = node;
+				p = node->inode_node.rb_left;
+			} else if (ino > node->s_ino) {
+				p = node->inode_node.rb_right;
+			} else {
+				pos = node;
+				break;
+			}
+#undef node
+		}
+	}
+	while (pos && pos->s_ns != ns) {
+		struct rb_node *p = rb_next(&pos->inode_node);
+		if (!p)
+			pos = NULL;
+		else
+			pos = rb_entry(p, struct sysfs_dirent, inode_node);
 	}
-	while (pos && pos->s_ns && pos->s_ns != ns)
-		pos = pos->s_sibling;
 	return pos;
 }
 
@@ -894,10 +961,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
 	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
 {
 	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
-	if (pos)
-		pos = pos->s_sibling;
-	while (pos && pos->s_ns && pos->s_ns != ns)
-		pos = pos->s_sibling;
+	if (pos) do {
+		struct rb_node *p = rb_next(&pos->inode_node);
+		if (!p)
+			pos = NULL;
+		else
+			pos = rb_entry(p, struct sysfs_dirent, inode_node);
+	} while (pos && pos->s_ns != ns);
 	return pos;
 }
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1ad8c93c1b8..d4e6080b4b2 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 	mutex_lock(&sysfs_mutex);
 
 	if (sd && dir)
-		/* Only directories are tagged, so no need to pass
-		 * a tag explicitly.
-		 */
 		sd = sysfs_find_dirent(sd, NULL, dir);
 	if (sd && attr)
 		sd = sysfs_find_dirent(sd, NULL, attr);
@@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = {
 	.poll		= sysfs_poll,
 };
 
+int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
+		  const void **pns)
+{
+	struct sysfs_dirent *dir_sd = kobj->sd;
+	const struct sysfs_ops *ops;
+	const void *ns = NULL;
+	int err;
+
+	err = 0;
+	if (!sysfs_ns_type(dir_sd))
+		goto out;
+
+	err = -EINVAL;
+	if (!kobj->ktype)
+		goto out;
+	ops = kobj->ktype->sysfs_ops;
+	if (!ops)
+		goto out;
+	if (!ops->namespace)
+		goto out;
+
+	err = 0;
+	ns = ops->namespace(kobj, attr);
+out:
+	if (err) {
+		WARN(1, KERN_ERR "missing sysfs namespace attribute operation for "
+		     "kobject: %s\n", kobject_name(kobj));
+	}
+	*pns = ns;
+	return err;
+}
+
 int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
 			const struct attribute *attr, int type, mode_t amode)
 {
 	umode_t mode = (amode & S_IALLUGO) | S_IFREG;
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
+	const void *ns;
 	int rc;
 
+	rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns);
+	if (rc)
+		return rc;
+
 	sd = sysfs_new_dirent(attr->name, mode, type);
 	if (!sd)
 		return -ENOMEM;
+
+	sd->s_ns = ns;
 	sd->s_attr.attr = (void *)attr;
 	sysfs_dirent_init_lockdep(sd);
 
@@ -586,12 +622,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 {
 	struct sysfs_dirent *sd;
 	struct iattr newattrs;
+	const void *ns;
 	int rc;
 
+	rc = sysfs_attr_ns(kobj, attr, &ns);
+	if (rc)
+		return rc;
+
 	mutex_lock(&sysfs_mutex);
 
 	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
+	sd = sysfs_find_dirent(kobj->sd, ns, attr->name);
 	if (!sd)
 		goto out;
 
@@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 {
-	sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
+	const void *ns;
+
+	if (sysfs_attr_ns(kobj, attr, &ns))
+		return;
+
+	sysfs_hash_and_remove(kobj->sd, ns, attr->name);
 }
 
 void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e3f091a81c7..e23f28894a3 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
 	inode->i_ctime = iattr->ia_ctime;
 }
 
-static int sysfs_count_nlink(struct sysfs_dirent *sd)
-{
-	struct sysfs_dirent *child;
-	int nr = 0;
-
-	for (child = sd->s_dir.children; child; child = child->s_sibling)
-		if (sysfs_type(child) == SYSFS_DIR)
-			nr++;
-
-	return nr + 2;
-}
-
 static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct sysfs_inode_attrs *iattrs = sd->s_iattr;
@@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
 	}
 
 	if (sysfs_type(sd) == SYSFS_DIR)
-		inode->i_nlink = sysfs_count_nlink(sd);
+		inode->i_nlink = sd->s_dir.subdirs + 2;
 }
 
 int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
 	sysfs_addrm_start(&acxt, dir_sd);
 
 	sd = sysfs_find_dirent(dir_sd, ns, name);
-	if (sd && (sd->s_ns != ns))
-		sd = NULL;
 	if (sd)
 		sysfs_remove_one(&acxt, sd);
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 845ab3ad229..ce29e28b766 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -11,14 +11,18 @@
 #include <linux/lockdep.h>
 #include <linux/kobject_ns.h>
 #include <linux/fs.h>
+#include <linux/rbtree.h>
 
 struct sysfs_open_dirent;
 
 /* type-specific structures for sysfs_dirent->s_* union members */
 struct sysfs_elem_dir {
 	struct kobject		*kobj;
-	/* children list starts here and goes through sd->s_sibling */
-	struct sysfs_dirent	*children;
+
+	unsigned long		subdirs;
+
+	struct rb_root		inode_tree;
+	struct rb_root		name_tree;
 };
 
 struct sysfs_elem_symlink {
@@ -56,9 +60,16 @@ struct sysfs_dirent {
 	struct lockdep_map	dep_map;
 #endif
 	struct sysfs_dirent	*s_parent;
-	struct sysfs_dirent	*s_sibling;
 	const char		*s_name;
 
+	struct rb_node		inode_node;
+	struct rb_node		name_node;
+
+	union {
+		struct completion	*completion;
+		struct sysfs_dirent	*removed_list;
+	} u;
+
 	const void		*s_ns; /* namespace tag */
 	union {
 		struct sysfs_elem_dir		s_dir;
diff --git a/fs/xattr.c b/fs/xattr.c
index f060663ab70..67583de8218 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -14,6 +14,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
+#include <linux/evm.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
 #include <linux/fsnotify.h>
@@ -166,6 +167,64 @@ out_noalloc:
 }
 EXPORT_SYMBOL_GPL(xattr_getsecurity);
 
+/*
+ * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
+ *
+ * Allocate memory, if not already allocated, or re-allocate correct size,
+ * before retrieving the extended attribute.
+ *
+ * Returns the result of alloc, if failed, or the getxattr operation.
+ */
+ssize_t
+vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
+		   size_t xattr_size, gfp_t flags)
+{
+	struct inode *inode = dentry->d_inode;
+	char *value = *xattr_value;
+	int error;
+
+	error = xattr_permission(inode, name, MAY_READ);
+	if (error)
+		return error;
+
+	if (!inode->i_op->getxattr)
+		return -EOPNOTSUPP;
+
+	error = inode->i_op->getxattr(dentry, name, NULL, 0);
+	if (error < 0)
+		return error;
+
+	if (!value || (error > xattr_size)) {
+		value = krealloc(*xattr_value, error + 1, flags);
+		if (!value)
+			return -ENOMEM;
+		memset(value, 0, error + 1);
+	}
+
+	error = inode->i_op->getxattr(dentry, name, value, error);
+	*xattr_value = value;
+	return error;
+}
+
+/* Compare an extended attribute value with the given value */
+int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
+		  const char *value, size_t size, gfp_t flags)
+{
+	char *xattr_value = NULL;
+	int rc;
+
+	rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
+	if (rc < 0)
+		return rc;
+
+	if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
+		rc = -EINVAL;
+	else
+		rc = 0;
+	kfree(xattr_value);
+	return rc;
+}
+
 ssize_t
 vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
 {
@@ -243,8 +302,10 @@ vfs_removexattr(struct dentry *dentry, const char *name)
 	error = inode->i_op->removexattr(dentry, name);
 	mutex_unlock(&inode->i_mutex);
 
-	if (!error)
+	if (!error) {
 		fsnotify_xattr(dentry);
+		evm_inode_post_removexattr(dentry, name);
+	}
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_removexattr);
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f7c8f7a9ea6..292eff19803 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -61,12 +61,7 @@ extern void  kmem_free(const void *);
 
 static inline void *kmem_zalloc_large(size_t size)
 {
-	void *ptr;
-
-	ptr = vmalloc(size);
-	if (ptr)
-		memset(ptr, 0, size);
-	return ptr;
+	return vzalloc(size);
 }
 static inline void kmem_free_large(void *ptr)
 {
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdd9cb54d63..ce84ffd0264 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -452,7 +452,7 @@ xfs_alloc_read_agfl(
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
-	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
+	xfs_buf_set_ref(bp, XFS_AGFL_REF);
 	*bpp = bp;
 	return 0;
 }
@@ -2139,7 +2139,7 @@ xfs_read_agf(
 		xfs_trans_brelse(tp, *bpp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
-	XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
+	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8c37dde4c52..11b2aad982d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -38,40 +38,6 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
-
-/*
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC		37
-#define to_ioend_wq(v)	(&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
-static wait_queue_head_t xfs_ioend_wq[NVSYNC];
-
-void __init
-xfs_ioend_init(void)
-{
-	int i;
-
-	for (i = 0; i < NVSYNC; i++)
-		init_waitqueue_head(&xfs_ioend_wq[i]);
-}
-
-void
-xfs_ioend_wait(
-	xfs_inode_t	*ip)
-{
-	wait_queue_head_t *wq = to_ioend_wq(ip);
-
-	wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
-}
-
-STATIC void
-xfs_ioend_wake(
-	xfs_inode_t	*ip)
-{
-	if (atomic_dec_and_test(&ip->i_iocount))
-		wake_up(to_ioend_wq(ip));
-}
-
 void
 xfs_count_page_state(
 	struct page		*page,
@@ -115,25 +81,20 @@ xfs_destroy_ioend(
 	xfs_ioend_t		*ioend)
 {
 	struct buffer_head	*bh, *next;
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 
 	for (bh = ioend->io_buffer_head; bh; bh = next) {
 		next = bh->b_private;
 		bh->b_end_io(bh, !ioend->io_error);
 	}
 
-	/*
-	 * Volume managers supporting multiple paths can send back ENODEV
-	 * when the final path disappears.  In this case continuing to fill
-	 * the page cache with dirty data which cannot be written out is
-	 * evil, so prevent that.
-	 */
-	if (unlikely(ioend->io_error == -ENODEV)) {
-		xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
-				      __FILE__, __LINE__);
+	if (ioend->io_iocb) {
+		if (ioend->io_isasync) {
+			aio_complete(ioend->io_iocb, ioend->io_error ?
+					ioend->io_error : ioend->io_result, 0);
+		}
+		inode_dio_done(ioend->io_inode);
 	}
 
-	xfs_ioend_wake(ip);
 	mempool_free(ioend, xfs_ioend_pool);
 }
 
@@ -156,6 +117,15 @@ xfs_ioend_new_eof(
 }
 
 /*
+ * Fast and loose check if this write could update the on-disk inode size.
+ */
+static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+{
+	return ioend->io_offset + ioend->io_size >
+		XFS_I(ioend->io_inode)->i_d.di_size;
+}
+
+/*
  * Update on-disk file size now that data has been written to disk.  The
  * current in-memory file size is i_size.  If a write is beyond eof i_new_size
  * will be the intended file size until i_size is updated.  If this write does
@@ -173,9 +143,6 @@ xfs_setfilesize(
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		isize;
 
-	if (unlikely(ioend->io_error))
-		return 0;
-
 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 		return EAGAIN;
 
@@ -192,6 +159,9 @@ xfs_setfilesize(
 
 /*
  * Schedule IO completion handling on the final put of an ioend.
+ *
+ * If there is no work to do we might as well call it a day and free the
+ * ioend right now.
  */
 STATIC void
 xfs_finish_ioend(
@@ -200,8 +170,10 @@ xfs_finish_ioend(
 	if (atomic_dec_and_test(&ioend->io_remaining)) {
 		if (ioend->io_type == IO_UNWRITTEN)
 			queue_work(xfsconvertd_workqueue, &ioend->io_work);
-		else
+		else if (xfs_ioend_is_append(ioend))
 			queue_work(xfsdatad_workqueue, &ioend->io_work);
+		else
+			xfs_destroy_ioend(ioend);
 	}
 }
 
@@ -216,17 +188,24 @@ xfs_end_io(
 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 	int		error = 0;
 
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		error = -EIO;
+		goto done;
+	}
+	if (ioend->io_error)
+		goto done;
+
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extens after the data I/O has finished.
 	 */
-	if (ioend->io_type == IO_UNWRITTEN &&
-	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
-
+	if (ioend->io_type == IO_UNWRITTEN) {
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						 ioend->io_size);
-		if (error)
-			ioend->io_error = error;
+		if (error) {
+			ioend->io_error = -error;
+			goto done;
+		}
 	}
 
 	/*
@@ -236,6 +215,7 @@ xfs_end_io(
 	error = xfs_setfilesize(ioend);
 	ASSERT(!error || error == EAGAIN);
 
+done:
 	/*
 	 * If we didn't complete processing of the ioend, requeue it to the
 	 * tail of the workqueue for another attempt later. Otherwise destroy
@@ -247,8 +227,6 @@ xfs_end_io(
 		/* ensure we don't spin on blocked ioends */
 		delay(1);
 	} else {
-		if (ioend->io_iocb)
-			aio_complete(ioend->io_iocb, ioend->io_result, 0);
 		xfs_destroy_ioend(ioend);
 	}
 }
@@ -285,13 +263,13 @@ xfs_alloc_ioend(
 	 * all the I/O from calling the completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_isasync = 0;
 	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;
 	ioend->io_buffer_tail = NULL;
-	atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
 	ioend->io_offset = 0;
 	ioend->io_size = 0;
 	ioend->io_iocb = NULL;
@@ -337,8 +315,8 @@ xfs_map_blocks(
 		count = mp->m_maxioffset - offset;
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
-			  bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				imap, &nimaps, bmapi_flags);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (error)
@@ -551,7 +529,6 @@ xfs_cancel_ioend(
 			unlock_buffer(bh);
 		} while ((bh = next_bh) != NULL);
 
-		xfs_ioend_wake(XFS_I(ioend->io_inode));
 		mempool_free(ioend, xfs_ioend_pool);
 	} while ((ioend = next) != NULL);
 }
@@ -1161,8 +1138,8 @@ __xfs_get_blocks(
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-	error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
-			  XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, XFS_BMAPI_ENTIRE);
 	if (error)
 		goto out_unlock;
 
@@ -1300,7 +1277,6 @@ xfs_end_io_direct_write(
 	bool			is_async)
 {
 	struct xfs_ioend	*ioend = iocb->private;
-	struct inode		*inode = ioend->io_inode;
 
 	/*
 	 * blockdev_direct_IO can return an error even after the I/O
@@ -1311,28 +1287,17 @@ xfs_end_io_direct_write(
 
 	ioend->io_offset = offset;
 	ioend->io_size = size;
+	ioend->io_iocb = iocb;
+	ioend->io_result = ret;
 	if (private && size > 0)
 		ioend->io_type = IO_UNWRITTEN;
 
 	if (is_async) {
-		/*
-		 * If we are converting an unwritten extent we need to delay
-		 * the AIO completion until after the unwrittent extent
-		 * conversion has completed, otherwise do it ASAP.
-		 */
-		if (ioend->io_type == IO_UNWRITTEN) {
-			ioend->io_iocb = iocb;
-			ioend->io_result = ret;
-		} else {
-			aio_complete(iocb, ret, 0);
-		}
+		ioend->io_isasync = 1;
 		xfs_finish_ioend(ioend);
 	} else {
 		xfs_finish_ioend_sync(ioend);
 	}
-
-	/* XXX: probably should move into the real I/O completion handler */
-	inode_dio_done(inode);
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71..116dd5c3703 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -47,6 +47,7 @@ typedef struct xfs_ioend {
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
+	unsigned int		io_isasync : 1;	/* needs aio_complete */
 	struct inode		*io_inode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
 	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
@@ -60,9 +61,6 @@ typedef struct xfs_ioend {
 extern const struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
-extern void xfs_ioend_init(void);
-extern void xfs_ioend_wait(struct xfs_inode *);
-
 extern void xfs_count_page_state(struct page *, int *, int *);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 160bcdc34a6..1e5d97f86ea 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -319,7 +319,7 @@ xfs_attr_set_int(
 		return (error);
 	}
 
-	xfs_trans_ijoin(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp, 0);
 
 	/*
 	 * If the attribute list is non-existent or a shortform list,
@@ -389,7 +389,7 @@ xfs_attr_set_int(
 		 * a new one.  We need the inode to be in all transactions.
 		 */
 		if (committed)
-			xfs_trans_ijoin(args.trans, dp);
+			xfs_trans_ijoin(args.trans, dp, 0);
 
 		/*
 		 * Commit the leaf transformation.  We'll need another (linked)
@@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks not allocate in the common case.
 	 */
-	xfs_trans_ijoin(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp, 0);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks, not allocate, in the common case.
 	 */
-	xfs_trans_ijoin(trans, dp);
+	xfs_trans_ijoin(trans, dp, 0);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	if (error)
 		goto out;
 
-	/*
-	 * Signal synchronous inactive transactions unless this is a
-	 * synchronous mount filesystem in which case we know that we're here
-	 * because we've been called out of xfs_inactive which means that the
-	 * last reference is gone and the unlink transaction has already hit
-	 * the disk so async inactive transactions are safe.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_WSYNC)) {
-		if (dp->i_d.di_anextents > 0)
-			xfs_trans_set_sync(trans);
-	}
-
 	error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
 	if (error)
 		goto out;
@@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * a new one.  We need the inode to be in all transactions.
 		 */
 		if (committed)
-			xfs_trans_ijoin(args->trans, dp);
+			xfs_trans_ijoin(args->trans, dp, 0);
 
 		/*
 		 * Commit the current trans (including the inode) and start
@@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 			 * in all transactions.
 			 */
 			if (committed)
-				xfs_trans_ijoin(args->trans, dp);
+				xfs_trans_ijoin(args->trans, dp, 0);
 		} else
 			xfs_da_buf_done(bp);
 
@@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 		 * a new one.  We need the inode to be in all transactions.
 		 */
 		if (committed)
-			xfs_trans_ijoin(args->trans, dp);
+			xfs_trans_ijoin(args->trans, dp, 0);
 	} else
 		xfs_da_buf_done(bp);
 	return(0);
@@ -1303,7 +1291,7 @@ restart:
 			 * in all transactions.
 			 */
 			if (committed)
-				xfs_trans_ijoin(args->trans, dp);
+				xfs_trans_ijoin(args->trans, dp, 0);
 
 			/*
 			 * Commit the node conversion and start the next
@@ -1340,7 +1328,7 @@ restart:
 		 * a new one.  We need the inode to be in all transactions.
 		 */
 		if (committed)
-			xfs_trans_ijoin(args->trans, dp);
+			xfs_trans_ijoin(args->trans, dp, 0);
 	} else {
 		/*
 		 * Addition succeeded, update Btree hashvals.
@@ -1452,7 +1440,7 @@ restart:
 			 * in all transactions.
 			 */
 			if (committed)
-				xfs_trans_ijoin(args->trans, dp);
+				xfs_trans_ijoin(args->trans, dp, 0);
 		}
 
 		/*
@@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		 * a new one.  We need the inode to be in all transactions.
 		 */
 		if (committed)
-			xfs_trans_ijoin(args->trans, dp);
+			xfs_trans_ijoin(args->trans, dp, 0);
 
 		/*
 		 * Commit the Btree join operation and start a new trans.
@@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 			 * in all transactions.
 			 */
 			if (committed)
-				xfs_trans_ijoin(args->trans, dp);
+				xfs_trans_ijoin(args->trans, dp, 0);
 		} else
 			xfs_da_brelse(args->trans, bp);
 	}
@@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	lblkno = args->rmtblkno;
 	while (valuelen > 0) {
 		nmap = ATTR_RMTVALUE_MAPSIZE;
-		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
-				  args->rmtblkcnt,
-				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, map, &nmap, NULL);
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       args->rmtblkcnt, map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
 		if (error)
 			return(error);
 		ASSERT(nmap >= 1);
@@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		 */
 		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
-		error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
+		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
 				  blkcnt,
-				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
-							XFS_BMAPI_WRITE,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				  args->firstblock, args->total, &map, &nmap,
 				  args->flist);
 		if (!error) {
@@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		 * a new one.  We need the inode to be in all transactions.
 		 */
 		if (committed)
-			xfs_trans_ijoin(args->trans, dp);
+			xfs_trans_ijoin(args->trans, dp, 0);
 
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		 */
 		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
-		error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
-				  args->rmtblkcnt,
-				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  args->firstblock, 0, &map, &nmap,
-				  NULL);
-		if (error) {
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+				       args->rmtblkcnt, &map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
 			return(error);
-		}
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
 		       (map.br_startblock != HOLESTARTBLOCK));
@@ -2121,16 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 
 		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
 				 XBF_LOCK | XBF_DONT_BLOCK);
-		ASSERT(!xfs_buf_geterror(bp));
-
+		if (!bp)
+			return ENOMEM;
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
 		xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
 			xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
-		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
-			return (error);
-		}
+		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
 		src += tmp;
 		valuelen -= tmp;
 
@@ -2166,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		/*
 		 * Try to remember where we decided to put the value.
 		 */
-		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
-		error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
-					args->rmtblkcnt,
-					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-					args->firstblock, 0, &map, &nmap,
-					args->flist);
-		if (error) {
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       args->rmtblkcnt, &map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
 			return(error);
-		}
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
 		       (map.br_startblock != HOLESTARTBLOCK));
@@ -2188,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		 */
 		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
 		if (bp) {
-			XFS_BUF_STALE(bp);
-			XFS_BUF_UNDELAYWRITE(bp);
+			xfs_buf_stale(bp);
 			xfs_buf_relse(bp);
 			bp = NULL;
 		}
@@ -2227,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		 * a new one.  We need the inode to be in all transactions.
 		 */
 		if (committed)
-			xfs_trans_ijoin(args->trans, args->dp);
+			xfs_trans_ijoin(args->trans, args->dp, 0);
 
 		/*
 		 * Close out trans and start the next one in the chain.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 8fad9602542..d4906e7c978 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 		 * Try to remember where we decided to put the value.
 		 */
 		nmap = 1;
-		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
-					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-					NULL, 0, &map, &nmap, NULL);
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+				       &map, &nmap, XFS_BMAPI_ATTRFORK);
 		if (error) {
 			return(error);
 		}
@@ -2948,6 +2947,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 			bp = xfs_trans_get_buf(*trans,
 					dp->i_mount->m_ddev_targp,
 					dblkno, dblkcnt, XBF_LOCK);
+			if (!bp)
+				return ENOMEM;
 			xfs_trans_binval(*trans, bp);
 			/*
 			 * Roll to next transaction.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 452a291383a..c68baeb0974 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -50,17 +50,22 @@
 #include "xfs_trace.h"
 
 
-#ifdef DEBUG
-STATIC void
-xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
-#endif
-
 kmem_zone_t		*xfs_bmap_free_item_zone;
 
 /*
  * Prototypes for internal bmap routines.
  */
 
+#ifdef DEBUG
+STATIC void
+xfs_bmap_check_leaf_extents(
+	struct xfs_btree_cur	*cur,
+	struct xfs_inode	*ip,
+	int			whichfork);
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)		do { } while (0)
+#endif
+
 
 /*
  * Called from xfs_bmap_add_attrfork to handle extents format files.
@@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local(
 	int			*flags);	/* inode logging flags */
 
 /*
- * Called by xfs_bmap_add_extent to handle cases converting a delayed
- * allocation to a real allocation.
- */
-STATIC int				/* error */
-xfs_bmap_add_extent_delay_real(
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
-	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp); /* inode logging flags */
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a delayed allocation.
- */
-STATIC int				/* error */
-xfs_bmap_add_extent_hole_delay(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp); /* inode logging flags */
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a real allocation.
- */
-STATIC int				/* error */
-xfs_bmap_add_extent_hole_real(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork); /* data or attr fork */
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting an unwritten
- * allocation to a real allocation or vice versa.
- */
-STATIC int				/* error */
-xfs_bmap_add_extent_unwritten_real(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp); /* inode logging flags */
-
-/*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
  * It figures out where to ask the underlying allocator to put the new extent.
  */
@@ -215,19 +168,6 @@ xfs_bmap_search_extents(
 	xfs_bmbt_irec_t	*prevp);	/* out: previous extent entry found */
 
 /*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- */
-STATIC int				/* error */
-xfs_bmap_isaeof(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_fileoff_t   off,		/* file offset in fsblocks */
-	int             whichfork,	/* data or attribute fork */
-	char		*aeof);		/* return value */
-
-/*
  * Compute the worst-case number of indirect blocks that will be used
  * for ip's delayed extent of length "len".
  */
@@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local(
 }
 
 /*
- * Called by xfs_bmapi to update file extent records and the btree
- * after allocating space (or doing a delayed allocation).
- */
-STATIC int				/* error */
-xfs_bmap_add_extent(
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork) /* data or attr fork */
-{
-	xfs_btree_cur_t		*cur;	/* btree cursor or null */
-	xfs_filblks_t		da_new; /* new count del alloc blocks used */
-	xfs_filblks_t		da_old; /* old count del alloc blocks used */
-	int			error;	/* error return value */
-	xfs_ifork_t		*ifp;	/* inode fork ptr */
-	int			logflags; /* returned value */
-	xfs_extnum_t		nextents; /* number of extents in file now */
-
-	XFS_STATS_INC(xs_add_exlist);
-
-	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	da_old = da_new = 0;
-	error = 0;
-
-	ASSERT(*idx >= 0);
-	ASSERT(*idx <= nextents);
-
-	/*
-	 * This is the first extent added to a new/empty file.
-	 * Special case this one, so other routines get to assume there are
-	 * already extents in the list.
-	 */
-	if (nextents == 0) {
-		xfs_iext_insert(ip, *idx, 1, new,
-				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-
-		ASSERT(cur == NULL);
-
-		if (!isnullstartblock(new->br_startblock)) {
-			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-		} else
-			logflags = 0;
-	}
-	/*
-	 * Any kind of new delayed allocation goes here.
-	 */
-	else if (isnullstartblock(new->br_startblock)) {
-		if (cur)
-			ASSERT((cur->bc_private.b.flags &
-				XFS_BTCUR_BPRV_WASDEL) == 0);
-		error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-						       &logflags);
-	}
-	/*
-	 * Real allocation off the end of the file.
-	 */
-	else if (*idx == nextents) {
-		if (cur)
-			ASSERT((cur->bc_private.b.flags &
-				XFS_BTCUR_BPRV_WASDEL) == 0);
-		error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, whichfork);
-	} else {
-		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
-
-		/*
-		 * Get the record referred to by idx.
-		 */
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
-		/*
-		 * If it's a real allocation record, and the new allocation ends
-		 * after the start of the referred to record, then we're filling
-		 * in a delayed or unwritten allocation with a real one, or
-		 * converting real back to unwritten.
-		 */
-		if (!isnullstartblock(new->br_startblock) &&
-		    new->br_startoff + new->br_blockcount > prev.br_startoff) {
-			if (prev.br_state != XFS_EXT_UNWRITTEN &&
-			    isnullstartblock(prev.br_startblock)) {
-				da_old = startblockval(prev.br_startblock);
-				if (cur)
-					ASSERT(cur->bc_private.b.flags &
-						XFS_BTCUR_BPRV_WASDEL);
-				error = xfs_bmap_add_extent_delay_real(tp, ip,
-						idx, &cur, new, &da_new,
-						first, flist, &logflags);
-			} else {
-				ASSERT(new->br_state == XFS_EXT_NORM ||
-				       new->br_state == XFS_EXT_UNWRITTEN);
-
-				error = xfs_bmap_add_extent_unwritten_real(ip,
-						idx, &cur, new, &logflags);
-				if (error)
-					goto done;
-			}
-		}
-		/*
-		 * Otherwise we're filling in a hole with an allocation.
-		 */
-		else {
-			if (cur)
-				ASSERT((cur->bc_private.b.flags &
-					XFS_BTCUR_BPRV_WASDEL) == 0);
-			error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-					new, &logflags, whichfork);
-		}
-	}
-
-	if (error)
-		goto done;
-	ASSERT(*curp == cur || *curp == NULL);
-
-	/*
-	 * Convert to a btree if necessary.
-	 */
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
-		int	tmp_logflags;	/* partial log flag return val */
-
-		ASSERT(cur == NULL);
-		error = xfs_bmap_extents_to_btree(tp, ip, first,
-			flist, &cur, da_old > 0, &tmp_logflags, whichfork);
-		logflags |= tmp_logflags;
-		if (error)
-			goto done;
-	}
-	/*
-	 * Adjust for changes in reserved delayed indirect blocks.
-	 * Nothing to do for disk quotas here.
-	 */
-	if (da_old || da_new) {
-		xfs_filblks_t	nblks;
-
-		nblks = da_new;
-		if (cur)
-			nblks += cur->bc_private.b.allocated;
-		ASSERT(nblks <= da_old);
-		if (nblks < da_old)
-			xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-				(int64_t)(da_old - nblks), 0);
-	}
-	/*
-	 * Clear out the allocated field, done with it now in any case.
-	 */
-	if (cur) {
-		cur->bc_private.b.allocated = 0;
-		*curp = cur;
-	}
-done:
-#ifdef DEBUG
-	if (!error)
-		xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
-#endif
-	*logflagsp = logflags;
-	return error;
-}
-
-/*
- * Called by xfs_bmap_add_extent to handle cases converting a delayed
- * allocation to a real allocation.
+ * Convert a delayed allocation to a real allocation.
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_delay_real(
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
-	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp) /* inode logging flags */
+	struct xfs_bmalloca	*bma)
 {
-	xfs_btree_cur_t		*cur;	/* btree cursor */
+	struct xfs_bmbt_irec	*new = &bma->got;
 	int			diff;	/* temp value */
 	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
 	int			error;	/* error return value */
@@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;	/* value for dnew calculations */
-	xfs_filblks_t		temp2=0;/* value for dnew calculations */
+	xfs_filblks_t		da_new; /* new count del alloc blocks used */
+	xfs_filblks_t		da_old; /* old count del alloc blocks used */
+	xfs_filblks_t		temp=0;	/* value for da_new calculations */
+	xfs_filblks_t		temp2=0;/* value for da_new calculations */
 	int			tmp_rval;	/* partial logging flags */
 
+	ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
+
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
@@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real(
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
-	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	ep = xfs_iext_get_ext(ifp, *idx);
+	ep = xfs_iext_get_ext(ifp, bma->idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
 
+	da_old = startblockval(PREV.br_startblock);
+	da_new = 0;
+
 	/*
 	 * Set flags determining what part of the previous delayed allocation
 	 * extent is being replaced by a real allocation.
@@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (*idx > 0) {
+	if (bma->idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
 
 		if (isnullstartblock(LEFT.br_startblock))
 			state |= BMAP_LEFT_DELAY;
@@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
 
 		if (isnullstartblock(RIGHT.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
@@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		--*idx;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		bma->idx--;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 2, state);
-		ip->i_d.di_nextents--;
-		if (cur == NULL)
+		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+		bma->ip->i_d.di_nextents--;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
 					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
+					RIGHT.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
 					PREV.br_blockcount +
-					RIGHT.br_blockcount, LEFT.br_state)))
+					RIGHT.br_blockcount, LEFT.br_state);
+			if (error)
 				goto done;
 		}
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		--*idx;
+		bma->idx--;
 
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 1, state);
-		if (cur == NULL)
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
 					LEFT.br_startblock, LEFT.br_blockcount,
-					&i)))
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
-					PREV.br_blockcount, LEFT.br_state)))
+					PREV.br_blockcount, LEFT.br_state);
+			if (error)
 				goto done;
 		}
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in all of a previously delayed allocation extent.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 1, state);
-		if (cur == NULL)
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
 					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
+					RIGHT.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+			error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
 					new->br_startblock,
 					PREV.br_blockcount +
-					RIGHT.br_blockcount, PREV.br_state)))
+					RIGHT.br_blockcount, PREV.br_state);
+			if (error)
 				goto done;
 		}
-
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real(
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
 					new->br_startblock, new->br_blockcount,
-					&i)))
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-
-		*dnew = 0;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
 
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		if (cur == NULL)
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
 					LEFT.br_startblock, LEFT.br_blockcount,
-					&i)))
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
 					new->br_blockcount,
-					LEFT.br_state)))
+					LEFT.br_state);
+			if (error)
 				goto done;
 		}
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
 			startblockval(PREV.br_startblock));
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		--*idx;
-		*dnew = temp;
+		bma->idx--;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is not contiguous.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, *idx, 1, new, state);
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
 					new->br_startblock, new->br_blockcount,
-					&i)))
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
-			error = xfs_bmap_extents_to_btree(tp, ip,
-					first, flist, &cur, 1, &tmp_rval,
-					XFS_DATA_FORK);
+		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist,
+					&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
 		}
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
 			startblockval(PREV.br_startblock) -
-			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, *idx + 1);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
-
-		*dnew = temp;
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is contiguous with the new allocation.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount,
 			RIGHT.br_state);
-		trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
-		if (cur == NULL)
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
 					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
+					RIGHT.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
 					new->br_startblock,
 					new->br_blockcount +
 					RIGHT.br_blockcount,
-					RIGHT.br_state)))
+					RIGHT.br_state);
+			if (error)
 				goto done;
 		}
 
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
 			startblockval(PREV.br_startblock));
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		++*idx;
-		*dnew = temp;
+		bma->idx++;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real(
 		 * The right neighbor is not contiguous.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(ip, *idx + 1, 1, new, state);
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
 					new->br_startblock, new->br_blockcount,
-					&i)))
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
-			error = xfs_bmap_extents_to_btree(tp, ip,
-				first, flist, &cur, 1, &tmp_rval,
-				XFS_DATA_FORK);
+		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur, 1,
+				&tmp_rval, XFS_DATA_FORK);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
 		}
-		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
 			startblockval(PREV.br_startblock) -
-			(cur ? cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, *idx);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		++*idx;
-		*dnew = temp;
+		bma->idx++;
 		break;
 
 	case 0:
@@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
 		LEFT = *new;
 		RIGHT.br_state = PREV.br_state;
 		RIGHT.br_startblock = nullstartblock(
-				(int)xfs_bmap_worst_indlen(ip, temp2));
+				(int)xfs_bmap_worst_indlen(bma->ip, temp2));
 		RIGHT.br_startoff = new_endoff;
 		RIGHT.br_blockcount = temp2;
 		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
-		xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
+		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
 					new->br_startblock, new->br_blockcount,
-					&i)))
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
-			error = xfs_bmap_extents_to_btree(tp, ip,
-					first, flist, &cur, 1, &tmp_rval,
-					XFS_DATA_FORK);
+		if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist, &bma->cur,
+					1, &tmp_rval, XFS_DATA_FORK);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
 		}
-		temp = xfs_bmap_worst_indlen(ip, temp);
-		temp2 = xfs_bmap_worst_indlen(ip, temp2);
+		temp = xfs_bmap_worst_indlen(bma->ip, temp);
+		temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
 		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
-			(cur ? cur->bc_private.b.allocated : 0));
-		if (diff > 0 &&
-		    xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-					     -((int64_t)diff), 0)) {
-			/*
-			 * Ick gross gag me with a spoon.
-			 */
-			ASSERT(0);	/* want to see if this ever happens! */
-			while (diff > 0) {
-				if (temp) {
-					temp--;
-					diff--;
-					if (!diff ||
-					    !xfs_icsb_modify_counters(ip->i_mount,
-						    XFS_SBS_FDBLOCKS,
-						    -((int64_t)diff), 0))
-						break;
-				}
-				if (temp2) {
-					temp2--;
-					diff--;
-					if (!diff ||
-					    !xfs_icsb_modify_counters(ip->i_mount,
-						    XFS_SBS_FDBLOCKS,
-						    -((int64_t)diff), 0))
-						break;
-				}
-			}
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		if (diff > 0) {
+			error = xfs_icsb_modify_counters(bma->ip->i_mount,
+					XFS_SBS_FDBLOCKS,
+					-((int64_t)diff), 0);
+			ASSERT(!error);
+			if (error)
+				goto done;
 		}
-		ep = xfs_iext_get_ext(ifp, *idx);
+
+		ep = xfs_iext_get_ext(ifp, bma->idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
 			nullstartblock((int)temp2));
-		trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
 
-		++*idx;
-		*dnew = temp + temp2;
+		bma->idx++;
+		da_new = temp + temp2;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		ASSERT(0);
 	}
-	*curp = cur;
+
+	/* convert to a btree if necessary */
+	if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* adjust for changes in reserved delayed indirect blocks */
+	if (da_old || da_new) {
+		temp = da_new;
+		if (bma->cur)
+			temp += bma->cur->bc_private.b.allocated;
+		ASSERT(temp <= da_old);
+		if (temp < da_old)
+			xfs_icsb_modify_counters(bma->ip->i_mount,
+					XFS_SBS_FDBLOCKS,
+					(int64_t)(da_old - temp), 0);
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
 done:
-	*logflagsp = rval;
+	bma->logflags |= rval;
 	return error;
 #undef	LEFT
 #undef	RIGHT
@@ -1124,15 +923,17 @@ done:
 }
 
 /*
- * Called by xfs_bmap_add_extent to handle cases converting an unwritten
- * allocation to a real allocation or vice versa.
+ * Convert an unwritten allocation to a real allocation or vice versa.
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_unwritten_real(
+	struct xfs_trans	*tp,
 	xfs_inode_t		*ip,	/* incore inode pointer */
 	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
+	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp) /* inode logging flags */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
@@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real(
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
 
+	*logflagsp = 0;
+
+	cur = *curp;
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+	ASSERT(*idx >= 0);
+	ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+
+	XFS_STATS_INC(xs_add_exlist);
+
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
+
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
 	error = 0;
-	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, *idx);
 	xfs_bmbt_get_all(ep, &PREV);
 	newext = new->br_state;
@@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
-			if (xfs_bmbt_update(cur, LEFT.br_startoff,
+			error = xfs_bmbt_update(cur, LEFT.br_startoff,
 				LEFT.br_startblock,
 				LEFT.br_blockcount + new->br_blockcount,
-				LEFT.br_state))
+				LEFT.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real(
 		 */
 		ASSERT(0);
 	}
-	*curp = cur;
+
+	/* convert to a btree if necessary */
+	if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+				0, &tmp_logflags, XFS_DATA_FORK);
+		*logflagsp |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		*curp = cur;
+	}
+
+	xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
 done:
-	*logflagsp = rval;
+	*logflagsp |= rval;
 	return error;
 #undef	LEFT
 #undef	RIGHT
@@ -1617,16 +1449,13 @@ done:
 }
 
 /*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a delayed allocation.
+ * Convert a hole to a delayed allocation.
  */
-/*ARGSUSED*/
-STATIC int				/* error */
+STATIC void
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
 	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp) /* inode logging flags */
+	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
 {
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
 	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
@@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay(
 		 * Nothing to do for disk quota accounting here.
 		 */
 	}
-	*logflagsp = 0;
-	return 0;
 }
 
 /*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a real allocation.
+ * Convert a hole to a real allocation.
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_real(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork) /* data or attr fork */
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
 {
+	struct xfs_bmbt_irec	*new = &bma->got;
 	int			error;	/* error return value */
 	int			i;	/* temp state */
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
@@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real(
 	int			rval=0;	/* return value (logging flags) */
 	int			state;	/* state bits, accessed thru macros */
 
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
-	state = 0;
+	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
 
+	state = 0;
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
 
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 */
-	if (*idx > 0) {
+	if (bma->idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
 		if (isnullstartblock(left.br_startblock))
 			state |= BMAP_LEFT_DELAY;
 	}
@@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real(
 	 * Check and set flags if this segment has a current value.
 	 * Not true if we're inserting into the "hole" at eof.
 	 */
-	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
 		if (isnullstartblock(right.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
 	}
@@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real(
 		 * left and on the right.
 		 * Merge all three into a single extent record.
 		 */
-		--*idx;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			left.br_blockcount + new->br_blockcount +
 			right.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		xfs_iext_remove(ip, *idx + 1, 1, state);
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
 
-		XFS_IFORK_NEXT_SET(ip, whichfork,
-			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-		if (cur == NULL) {
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+		if (bma->cur == NULL) {
 			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur,
-					right.br_startoff,
-					right.br_startblock,
-					right.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+					right.br_startblock, right.br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, left.br_startoff,
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
 					left.br_startblock,
 					left.br_blockcount +
 						new->br_blockcount +
 						right.br_blockcount,
-					left.br_state)))
+					left.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real(
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
-		--*idx;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
 			left.br_blockcount + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		if (cur == NULL) {
+		if (bma->cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur,
-					left.br_startoff,
-					left.br_startblock,
-					left.br_blockcount, &i)))
+			error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+					left.br_startblock, left.br_blockcount,
+					&i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, left.br_startoff,
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
 					left.br_startblock,
 					left.br_blockcount +
 						new->br_blockcount,
-					left.br_state)))
+					left.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real(
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + right.br_blockcount,
 			right.br_state);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		if (cur == NULL) {
+		if (bma->cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur,
+			error = xfs_bmbt_lookup_eq(bma->cur,
 					right.br_startoff,
 					right.br_startblock,
-					right.br_blockcount, &i)))
+					right.br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
 					new->br_startblock,
 					new->br_blockcount +
 						right.br_blockcount,
-					right.br_state)))
+					right.br_state);
+			if (error)
 				goto done;
 		}
 		break;
@@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real(
 		 * real allocation.
 		 * Insert a new entry.
 		 */
-		xfs_iext_insert(ip, *idx, 1, new, state);
-		XFS_IFORK_NEXT_SET(ip, whichfork,
-			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
-		if (cur == NULL) {
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+		if (bma->cur == NULL) {
 			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur,
+			error = xfs_bmbt_lookup_eq(bma->cur,
 					new->br_startoff,
 					new->br_startblock,
-					new->br_blockcount, &i)))
+					new->br_blockcount, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = new->br_state;
-			if ((error = xfs_btree_insert(cur, &i)))
+			bma->cur->bc_rec.b.br_state = new->br_state;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		break;
 	}
+
+	/* convert to a btree if necessary */
+	if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				0, &tmp_logflags, whichfork);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
 done:
-	*logflagsp = rval;
+	bma->logflags |= rval;
 	return error;
 }
 
@@ -2160,26 +2018,26 @@ xfs_bmap_adjacent(
 		XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
 
 	mp = ap->ip->i_mount;
-	nullfb = ap->firstblock == NULLFSBLOCK;
+	nullfb = *ap->firstblock == NULLFSBLOCK;
 	rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
-	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	/*
 	 * If allocating at eof, and there's a previous real block,
 	 * try to use its last block as our starting point.
 	 */
-	if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
-	    !isnullstartblock(ap->prevp->br_startblock) &&
-	    ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
-		    ap->prevp->br_startblock)) {
-		ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
+	if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+	    !isnullstartblock(ap->prev.br_startblock) &&
+	    ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+		    ap->prev.br_startblock)) {
+		ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
 		/*
 		 * Adjust for the gap between prevp and us.
 		 */
-		adjust = ap->off -
-			(ap->prevp->br_startoff + ap->prevp->br_blockcount);
+		adjust = ap->offset -
+			(ap->prev.br_startoff + ap->prev.br_blockcount);
 		if (adjust &&
-		    ISVALID(ap->rval + adjust, ap->prevp->br_startblock))
-			ap->rval += adjust;
+		    ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+			ap->blkno += adjust;
 	}
 	/*
 	 * If not at eof, then compare the two neighbor blocks.
@@ -2196,17 +2054,17 @@ xfs_bmap_adjacent(
 		 * If there's a previous (left) block, select a requested
 		 * start block based on it.
 		 */
-		if (ap->prevp->br_startoff != NULLFILEOFF &&
-		    !isnullstartblock(ap->prevp->br_startblock) &&
-		    (prevbno = ap->prevp->br_startblock +
-			       ap->prevp->br_blockcount) &&
-		    ISVALID(prevbno, ap->prevp->br_startblock)) {
+		if (ap->prev.br_startoff != NULLFILEOFF &&
+		    !isnullstartblock(ap->prev.br_startblock) &&
+		    (prevbno = ap->prev.br_startblock +
+			       ap->prev.br_blockcount) &&
+		    ISVALID(prevbno, ap->prev.br_startblock)) {
 			/*
 			 * Calculate gap to end of previous block.
 			 */
-			adjust = prevdiff = ap->off -
-				(ap->prevp->br_startoff +
-				 ap->prevp->br_blockcount);
+			adjust = prevdiff = ap->offset -
+				(ap->prev.br_startoff +
+				 ap->prev.br_blockcount);
 			/*
 			 * Figure the startblock based on the previous block's
 			 * end and the gap size.
@@ -2215,9 +2073,9 @@ xfs_bmap_adjacent(
 			 * allocating, or using it gives us an invalid block
 			 * number, then just use the end of the previous block.
 			 */
-			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
 			    ISVALID(prevbno + prevdiff,
-				    ap->prevp->br_startblock))
+				    ap->prev.br_startblock))
 				prevbno += adjust;
 			else
 				prevdiff += adjust;
@@ -2238,16 +2096,16 @@ xfs_bmap_adjacent(
 		 * If there's a following (right) block, select a requested
 		 * start block based on it.
 		 */
-		if (!isnullstartblock(ap->gotp->br_startblock)) {
+		if (!isnullstartblock(ap->got.br_startblock)) {
 			/*
 			 * Calculate gap to start of next block.
 			 */
-			adjust = gotdiff = ap->gotp->br_startoff - ap->off;
+			adjust = gotdiff = ap->got.br_startoff - ap->offset;
 			/*
 			 * Figure the startblock based on the next block's
 			 * start and the gap size.
 			 */
-			gotbno = ap->gotp->br_startblock;
+			gotbno = ap->got.br_startblock;
 			/*
 			 * Heuristic!
 			 * If the gap is large relative to the piece we're
@@ -2255,12 +2113,12 @@ xfs_bmap_adjacent(
 			 * number, then just use the start of the next block
 			 * offset by our length.
 			 */
-			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
 			    ISVALID(gotbno - gotdiff, gotbno))
 				gotbno -= adjust;
-			else if (ISVALID(gotbno - ap->alen, gotbno)) {
-				gotbno -= ap->alen;
-				gotdiff += adjust - ap->alen;
+			else if (ISVALID(gotbno - ap->length, gotbno)) {
+				gotbno -= ap->length;
+				gotdiff += adjust - ap->length;
 			} else
 				gotdiff += adjust;
 			/*
@@ -2278,14 +2136,14 @@ xfs_bmap_adjacent(
 			gotbno = NULLFSBLOCK;
 		/*
 		 * If both valid, pick the better one, else the only good
-		 * one, else ap->rval is already set (to 0 or the inode block).
+		 * one, else ap->blkno is already set (to 0 or the inode block).
 		 */
 		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
-			ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
+			ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
 		else if (prevbno != NULLFSBLOCK)
-			ap->rval = prevbno;
+			ap->blkno = prevbno;
 		else if (gotbno != NULLFSBLOCK)
-			ap->rval = gotbno;
+			ap->blkno = gotbno;
 	}
 #undef ISVALID
 }
@@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc(
 	mp = ap->ip->i_mount;
 	align = xfs_get_extsz_hint(ap->ip);
 	prod = align / mp->m_sb.sb_rextsize;
-	error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
+	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
 					align, 1, ap->eof, 0,
-					ap->conv, &ap->off, &ap->alen);
+					ap->conv, &ap->offset, &ap->length);
 	if (error)
 		return error;
-	ASSERT(ap->alen);
-	ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
+	ASSERT(ap->length);
+	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
 
 	/*
 	 * If the offset & length are not perfectly aligned
 	 * then kill prod, it will just get us in trouble.
 	 */
-	if (do_mod(ap->off, align) || ap->alen % align)
+	if (do_mod(ap->offset, align) || ap->length % align)
 		prod = 1;
 	/*
 	 * Set ralen to be the actual requested length in rtextents.
 	 */
-	ralen = ap->alen / mp->m_sb.sb_rextsize;
+	ralen = ap->length / mp->m_sb.sb_rextsize;
 	/*
 	 * If the old value was close enough to MAXEXTLEN that
 	 * we rounded up to it, cut it back so it's valid again.
@@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc(
 	 * Lock out other modifications to the RT bitmap inode.
 	 */
 	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If it's an allocation to an empty file at offset 0,
 	 * pick an extent that will space things out in the rt area.
 	 */
-	if (ap->eof && ap->off == 0) {
+	if (ap->eof && ap->offset == 0) {
 		xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
 
 		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
 		if (error)
 			return error;
-		ap->rval = rtx * mp->m_sb.sb_rextsize;
+		ap->blkno = rtx * mp->m_sb.sb_rextsize;
 	} else {
-		ap->rval = 0;
+		ap->blkno = 0;
 	}
 
 	xfs_bmap_adjacent(ap);
@@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc(
 	/*
 	 * Realtime allocation, done through xfs_rtallocate_extent.
 	 */
-	atype = ap->rval == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
-	do_div(ap->rval, mp->m_sb.sb_rextsize);
-	rtb = ap->rval;
-	ap->alen = ralen;
-	if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
+	atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+	do_div(ap->blkno, mp->m_sb.sb_rextsize);
+	rtb = ap->blkno;
+	ap->length = ralen;
+	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
 				&ralen, atype, ap->wasdel, prod, &rtb)))
 		return error;
 	if (rtb == NULLFSBLOCK && prod > 1 &&
-	    (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
-					   ap->alen, &ralen, atype,
+	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+					   ap->length, &ralen, atype,
 					   ap->wasdel, 1, &rtb)))
 		return error;
-	ap->rval = rtb;
-	if (ap->rval != NULLFSBLOCK) {
-		ap->rval *= mp->m_sb.sb_rextsize;
+	ap->blkno = rtb;
+	if (ap->blkno != NULLFSBLOCK) {
+		ap->blkno *= mp->m_sb.sb_rextsize;
 		ralen *= mp->m_sb.sb_rextsize;
-		ap->alen = ralen;
+		ap->length = ralen;
 		ap->ip->i_d.di_nblocks += ralen;
 		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
 		if (ap->wasdel)
@@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc(
 			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 	} else {
-		ap->alen = 0;
+		ap->length = 0;
 	}
 	return 0;
 }
@@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb(
 	 * AG as the stream may have moved.
 	 */
 	if (xfs_inode_is_filestream(ap->ip))
-		ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+		ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
 
 	return 0;
 }
@@ -2528,52 +2386,52 @@ xfs_bmap_btalloc(
 	mp = ap->ip->i_mount;
 	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
 	if (unlikely(align)) {
-		error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
+		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
 						align, 0, ap->eof, 0, ap->conv,
-						&ap->off, &ap->alen);
+						&ap->offset, &ap->length);
 		ASSERT(!error);
-		ASSERT(ap->alen);
+		ASSERT(ap->length);
 	}
-	nullfb = ap->firstblock == NULLFSBLOCK;
-	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+	nullfb = *ap->firstblock == NULLFSBLOCK;
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	if (nullfb) {
 		if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
 			ag = xfs_filestream_lookup_ag(ap->ip);
 			ag = (ag != NULLAGNUMBER) ? ag : 0;
-			ap->rval = XFS_AGB_TO_FSB(mp, ag, 0);
+			ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
 		} else {
-			ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+			ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
 		}
 	} else
-		ap->rval = ap->firstblock;
+		ap->blkno = *ap->firstblock;
 
 	xfs_bmap_adjacent(ap);
 
 	/*
-	 * If allowed, use ap->rval; otherwise must use firstblock since
+	 * If allowed, use ap->blkno; otherwise must use firstblock since
 	 * it's in the right allocation group.
 	 */
-	if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno)
+	if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
 		;
 	else
-		ap->rval = ap->firstblock;
+		ap->blkno = *ap->firstblock;
 	/*
 	 * Normal allocation, done through xfs_alloc_vextent.
 	 */
 	tryagain = isaligned = 0;
 	args.tp = ap->tp;
 	args.mp = mp;
-	args.fsbno = ap->rval;
+	args.fsbno = ap->blkno;
 
 	/* Trim the allocation back to the maximum an AG can fit. */
-	args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
-	args.firstblock = ap->firstblock;
+	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+	args.firstblock = *ap->firstblock;
 	blen = 0;
 	if (nullfb) {
 		error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
 		if (error)
 			return error;
-	} else if (ap->low) {
+	} else if (ap->flist->xbf_low) {
 		if (xfs_inode_is_filestream(ap->ip))
 			args.type = XFS_ALLOCTYPE_FIRST_AG;
 		else
@@ -2587,14 +2445,14 @@ xfs_bmap_btalloc(
 	/* apply extent size hints if obtained earlier */
 	if (unlikely(align)) {
 		args.prod = align;
-		if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
+		if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);
 	} else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
 		args.prod = 1;
 		args.mod = 0;
 	} else {
 		args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
-		if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
+		if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);
 	}
 	/*
@@ -2606,8 +2464,8 @@ xfs_bmap_btalloc(
 	 * is >= the stripe unit and the allocation offset is
 	 * at the end of file.
 	 */
-	if (!ap->low && ap->aeof) {
-		if (!ap->off) {
+	if (!ap->flist->xbf_low && ap->aeof) {
+		if (!ap->offset) {
 			args.alignment = mp->m_dalign;
 			atype = args.type;
 			isaligned = 1;
@@ -2660,7 +2518,7 @@ xfs_bmap_btalloc(
 		 * turned on.
 		 */
 		args.type = atype;
-		args.fsbno = ap->rval;
+		args.fsbno = ap->blkno;
 		args.alignment = mp->m_dalign;
 		args.minlen = nextminlen;
 		args.minalignslop = 0;
@@ -2674,7 +2532,7 @@ xfs_bmap_btalloc(
 		 * try again.
 		 */
 		args.type = atype;
-		args.fsbno = ap->rval;
+		args.fsbno = ap->blkno;
 		args.alignment = 0;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
@@ -2683,7 +2541,7 @@ xfs_bmap_btalloc(
 	    args.minlen > ap->minlen) {
 		args.minlen = ap->minlen;
 		args.type = XFS_ALLOCTYPE_START_BNO;
-		args.fsbno = ap->rval;
+		args.fsbno = ap->blkno;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
@@ -2694,13 +2552,26 @@ xfs_bmap_btalloc(
 		args.minleft = 0;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
-		ap->low = 1;
+		ap->flist->xbf_low = 1;
 	}
 	if (args.fsbno != NULLFSBLOCK) {
-		ap->firstblock = ap->rval = args.fsbno;
+		/*
+		 * check the allocation happened at the same or higher AG than
+		 * the first block that was allocated.
+		 */
+		ASSERT(*ap->firstblock == NULLFSBLOCK ||
+		       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+		       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+		       (ap->flist->xbf_low &&
+			XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+			XFS_FSB_TO_AGNO(mp, args.fsbno)));
+
+		ap->blkno = args.fsbno;
+		if (*ap->firstblock == NULLFSBLOCK)
+			*ap->firstblock = args.fsbno;
 		ASSERT(nullfb || fb_agno == args.agno ||
-		       (ap->low && fb_agno < args.agno));
-		ap->alen = args.len;
+		       (ap->flist->xbf_low && fb_agno < args.agno));
+		ap->length = args.len;
 		ap->ip->i_d.di_nblocks += args.len;
 		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
 		if (ap->wasdel)
@@ -2714,8 +2585,8 @@ xfs_bmap_btalloc(
 					XFS_TRANS_DQ_BCOUNT,
 			(long) args.len);
 	} else {
-		ap->rval = NULLFSBLOCK;
-		ap->alen = 0;
+		ap->blkno = NULLFSBLOCK;
+		ap->length = 0;
 	}
 	return 0;
 }
@@ -3589,7 +3460,7 @@ xfs_bmap_add_attrfork(
 	}
 	ASSERT(ip->i_d.di_anextents == 0);
 
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	switch (ip->i_d.di_format) {
@@ -3782,19 +3653,11 @@ xfs_bmap_compute_maxlevels(
  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
  * caller.  Frees all the extents that need freeing, which must be done
  * last due to locking considerations.  We never free any extents in
- * the first transaction.  This is to allow the caller to make the first
- * transaction a synchronous one so that the pointers to the data being
- * broken in this transaction will be permanent before the data is actually
- * freed.  This is necessary to prevent blocks from being reallocated
- * and written to before the free and reallocation are actually permanent.
- * We do not just make the first transaction synchronous here, because
- * there are more efficient ways to gain the same protection in some cases
- * (see the file truncation code).
+ * the first transaction.
  *
  * Return 1 if the given transaction was committed and a new one
  * started, and 0 otherwise in the committed parameter.
  */
-/*ARGSUSED*/
 int						/* error */
 xfs_bmap_finish(
 	xfs_trans_t		**tp,		/* transaction pointer addr */
@@ -3994,42 +3857,122 @@ xfs_bmap_last_before(
 	return 0;
 }
 
+STATIC int
+xfs_bmap_last_extent(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*rec,
+	int			*is_empty)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			error;
+	int			nextents;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*is_empty = 1;
+		return 0;
+	}
+
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+	*is_empty = 0;
+	return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	bma->aeof = 0;
+	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+				     &is_empty);
+	if (error || is_empty)
+		return error;
+
+	/*
+	 * Check if we are allocation or past the last extent, or at least into
+	 * the last delayed allocated extent.
+	 */
+	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+		(bma->offset >= rec.br_startoff &&
+		 isnullstartblock(rec.br_startblock));
+	return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.  All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof)
+{
+	struct xfs_bmbt_irec	rec;
+	int			error;
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+	if (error || *eof)
+		return error;
+
+	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
 /*
  * Returns the file-relative block number of the first block past eof in
  * the file.  This is not based on i_size, it is based on the extent records.
  * Returns 0 for local files, as they do not have extent records.
  */
-int						/* error */
+int
 xfs_bmap_last_offset(
-	xfs_trans_t	*tp,			/* transaction pointer */
-	xfs_inode_t	*ip,			/* incore inode */
-	xfs_fileoff_t	*last_block,		/* last block */
-	int		whichfork)		/* data or attr fork */
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*last_block,
+	int			whichfork)
 {
-	xfs_bmbt_rec_host_t *ep;		/* pointer to last extent */
-	int		error;			/* error return value */
-	xfs_ifork_t	*ifp;			/* inode fork pointer */
-	xfs_extnum_t	nextents;		/* number of extent entries */
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	*last_block = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+		return 0;
 
 	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
 	       return XFS_ERROR(EIO);
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-		*last_block = 0;
-		return 0;
-	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+	if (error || is_empty)
 		return error;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	if (!nextents) {
-		*last_block = 0;
-		return 0;
-	}
-	ep = xfs_iext_get_ext(ifp, nextents - 1);
-	*last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
+
+	*last_block = rec.br_startoff + rec.br_blockcount;
 	return 0;
 }
 
@@ -4159,7 +4102,6 @@ xfs_bmap_read_extents(
 		xfs_extnum_t	num_recs;
 		xfs_extnum_t	start;
 
-
 		num_recs = xfs_btree_get_numrecs(block);
 		if (unlikely(i + num_recs > room)) {
 			ASSERT(i + num_recs <= room);
@@ -4282,9 +4224,8 @@ xfs_bmap_validate_ret(
 		ASSERT(i == 0 ||
 		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
 		       mval[i].br_startoff);
-		if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
-			ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
-			       mval[i].br_startblock != HOLESTARTBLOCK);
+		ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+		       mval[i].br_startblock != HOLESTARTBLOCK);
 		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
 		       mval[i].br_state == XFS_EXT_UNWRITTEN);
 	}
@@ -4293,66 +4234,609 @@ xfs_bmap_validate_ret(
 
 
 /*
- * Map file blocks to filesystem blocks.
- * File range is given by the bno/len pair.
- * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
- * into a hole or past eof.
- * Only allocates blocks from a single allocation group,
- * to avoid locking problems.
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+	struct xfs_bmbt_irec	*mval,
+	struct xfs_bmbt_irec	*got,
+	xfs_fileoff_t		*bno,
+	xfs_filblks_t		len,
+	xfs_fileoff_t		obno,
+	xfs_fileoff_t		end,
+	int			n,
+	int			flags)
+{
+	if ((flags & XFS_BMAPI_ENTIRE) ||
+	    got->br_startoff + got->br_blockcount <= obno) {
+		*mval = *got;
+		if (isnullstartblock(got->br_startblock))
+			mval->br_startblock = DELAYSTARTBLOCK;
+		return;
+	}
+
+	if (obno > *bno)
+		*bno = obno;
+	ASSERT((*bno >= obno) || (n == 0));
+	ASSERT(*bno < end);
+	mval->br_startoff = *bno;
+	if (isnullstartblock(got->br_startblock))
+		mval->br_startblock = DELAYSTARTBLOCK;
+	else
+		mval->br_startblock = got->br_startblock +
+					(*bno - got->br_startoff);
+	/*
+	 * Return the minimum of what we got and what we asked for for
+	 * the length.  We can use the len variable here because it is
+	 * modified below and we could have been there before coming
+	 * here if the first part of the allocation didn't overlap what
+	 * was asked for.
+	 */
+	mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+			got->br_blockcount - (*bno - got->br_startoff));
+	mval->br_state = got->br_state;
+	ASSERT(mval->br_blockcount <= len);
+	return;
+}
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+	struct xfs_bmbt_irec	**map,
+	xfs_fileoff_t		*bno,
+	xfs_filblks_t		*len,
+	xfs_fileoff_t		obno,
+	xfs_fileoff_t		end,
+	int			*n,
+	int			flags)
+{
+	xfs_bmbt_irec_t	*mval = *map;
+
+	ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+	       ((mval->br_startoff + mval->br_blockcount) <= end));
+	ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+	       (mval->br_startoff < obno));
+
+	*bno = mval->br_startoff + mval->br_blockcount;
+	*len = end - *bno;
+	if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+		/* update previous map with new information */
+		ASSERT(mval->br_startblock == mval[-1].br_startblock);
+		ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+		ASSERT(mval->br_state == mval[-1].br_state);
+		mval[-1].br_blockcount = mval->br_blockcount;
+		mval[-1].br_state = mval->br_state;
+	} else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock != DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock != HOLESTARTBLOCK &&
+		   mval->br_startblock == mval[-1].br_startblock +
+					  mval[-1].br_blockcount &&
+		   ((flags & XFS_BMAPI_IGSTATE) ||
+			mval[-1].br_state == mval->br_state)) {
+		ASSERT(mval->br_startoff ==
+		       mval[-1].br_startoff + mval[-1].br_blockcount);
+		mval[-1].br_blockcount += mval->br_blockcount;
+	} else if (*n > 0 &&
+		   mval->br_startblock == DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock == DELAYSTARTBLOCK &&
+		   mval->br_startoff ==
+		   mval[-1].br_startoff + mval[-1].br_blockcount) {
+		mval[-1].br_blockcount += mval->br_blockcount;
+		mval[-1].br_state = mval->br_state;
+	} else if (!((*n == 0) &&
+		     ((mval->br_startoff + mval->br_blockcount) <=
+		      obno))) {
+		mval++;
+		(*n)++;
+	}
+	*map = mval;
+}
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
+int
+xfs_bmapi_read(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	struct xfs_bmbt_irec	*mval,
+	int			*nmap,
+	int			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp;
+	struct xfs_bmbt_irec	got;
+	struct xfs_bmbt_irec	prev;
+	xfs_fileoff_t		obno;
+	xfs_fileoff_t		end;
+	xfs_extnum_t		lastx;
+	int			error;
+	int			eof;
+	int			n = 0;
+	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
+			   XFS_BMAPI_IGSTATE)));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	XFS_STATS_INC(xs_blk_mapr);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+	end = bno + len;
+	obno = bno;
+
+	while (bno < end && n < *nmap) {
+		/* Reading past eof, act as though there's a hole up to end. */
+		if (eof)
+			got.br_startoff = end;
+		if (got.br_startoff > bno) {
+			/* Reading in a hole.  */
+			mval->br_startoff = bno;
+			mval->br_startblock = HOLESTARTBLOCK;
+			mval->br_blockcount =
+				XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+			mval->br_state = XFS_EXT_NORM;
+			bno += mval->br_blockcount;
+			len -= mval->br_blockcount;
+			mval++;
+			n++;
+			continue;
+		}
+
+		/* set up the extent map to return. */
+		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/* If we're done, stop now. */
+		if (bno >= end || n >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+		else
+			eof = 1;
+	}
+	*nmap = n;
+	return 0;
+}
+
+STATIC int
+xfs_bmapi_reserve_delalloc(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		aoff,
+	xfs_filblks_t		len,
+	struct xfs_bmbt_irec	*got,
+	struct xfs_bmbt_irec	*prev,
+	xfs_extnum_t		*lastx,
+	int			eof)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	xfs_extlen_t		alen;
+	xfs_extlen_t		indlen;
+	char			rt = XFS_IS_REALTIME_INODE(ip);
+	xfs_extlen_t		extsz;
+	int			error;
+
+	alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+	if (!eof)
+		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+
+	/* Figure out the extent size, adjust alen */
+	extsz = xfs_get_extsz_hint(ip);
+	if (extsz) {
+		/*
+		 * Make sure we don't exceed a single extent length when we
+		 * align the extent by reducing length we are going to
+		 * allocate by the maximum amount extent size aligment may
+		 * require.
+		 */
+		alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
+		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+					       1, 0, &aoff, &alen);
+		ASSERT(!error);
+	}
+
+	if (rt)
+		extsz = alen / mp->m_sb.sb_rextsize;
+
+	/*
+	 * Make a transaction-less quota reservation for delayed allocation
+	 * blocks.  This number gets adjusted later.  We return if we haven't
+	 * allocated blocks already inside this loop.
+	 */
+	error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
+			rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		return error;
+
+	/*
+	 * Split changing sb for alen and indlen since they could be coming
+	 * from different places.
+	 */
+	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+	ASSERT(indlen > 0);
+
+	if (rt) {
+		error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+					  -((int64_t)extsz), 0);
+	} else {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+						 -((int64_t)alen), 0);
+	}
+
+	if (error)
+		goto out_unreserve_quota;
+
+	error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+					 -((int64_t)indlen), 0);
+	if (error)
+		goto out_unreserve_blocks;
+
+
+	ip->i_delayed_blks += alen;
+
+	got->br_startoff = aoff;
+	got->br_startblock = nullstartblock(indlen);
+	got->br_blockcount = alen;
+	got->br_state = XFS_EXT_NORM;
+	xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+
+	/*
+	 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+	 * might have merged it into one of the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+
+	ASSERT(got->br_startoff <= aoff);
+	ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+	ASSERT(isnullstartblock(got->br_startblock));
+	ASSERT(got->br_state == XFS_EXT_NORM);
+	return 0;
+
+out_unreserve_blocks:
+	if (rt)
+		xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+	else
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+out_unreserve_quota:
+	if (XFS_IS_QUOTA_ON(mp))
+		xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ?
+				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+	return error;
+}
+
+/*
+ * Map file blocks to filesystem blocks, adding delayed allocations as needed.
+ */
+int
+xfs_bmapi_delay(
+	struct xfs_inode	*ip,	/* incore inode */
+	xfs_fileoff_t		bno,	/* starting file offs. mapped */
+	xfs_filblks_t		len,	/* length to map in file */
+	struct xfs_bmbt_irec	*mval,	/* output: map values */
+	int			*nmap,	/* i/o: mval size/count */
+	int			flags)	/* XFS_BMAPI_... */
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_bmbt_irec	got;	/* current file extent record */
+	struct xfs_bmbt_irec	prev;	/* previous file extent record */
+	xfs_fileoff_t		obno;	/* old block number (offset) */
+	xfs_fileoff_t		end;	/* end of mapped file region */
+	xfs_extnum_t		lastx;	/* last useful extent number */
+	int			eof;	/* we've hit the end of extents */
+	int			n = 0;	/* current extent index */
+	int			error = 0;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	XFS_STATS_INC(xs_blk_mapw);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			return error;
+	}
+
+	xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
+	end = bno + len;
+	obno = bno;
+
+	while (bno < end && n < *nmap) {
+		if (eof || got.br_startoff > bno) {
+			error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
+							   &prev, &lastx, eof);
+			if (error) {
+				if (n == 0) {
+					*nmap = 0;
+					return error;
+				}
+				break;
+			}
+		}
+
+		/* set up the extent map to return. */
+		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/* If we're done, stop now. */
+		if (bno >= end || n >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		prev = got;
+		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+		else
+			eof = 1;
+	}
+
+	*nmap = n;
+	return 0;
+}
+
+
+STATIC int
+xfs_bmapi_allocate(
+	struct xfs_bmalloca	*bma,
+	int			flags)
+{
+	struct xfs_mount	*mp = bma->ip->i_mount;
+	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	int			tmp_logflags = 0;
+	int			error;
+	int			rt;
+
+	rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
+
+	/*
+	 * For the wasdelay case, we could also just allocate the stuff asked
+	 * for in this bmap call but that wouldn't be as good.
+	 */
+	if (bma->wasdel) {
+		bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+		bma->offset = bma->got.br_startoff;
+		if (bma->idx != NULLEXTNUM && bma->idx) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
+					 &bma->prev);
+		}
+	} else {
+		bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+		if (!bma->eof)
+			bma->length = XFS_FILBLKS_MIN(bma->length,
+					bma->got.br_startoff - bma->offset);
+	}
+
+	/*
+	 * Indicate if this is the first user data in the file, or just any
+	 * user data.
+	 */
+	if (!(flags & XFS_BMAPI_METADATA)) {
+		bma->userdata = (bma->offset == 0) ?
+			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+	}
+
+	bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+
+	/*
+	 * Only want to do the alignment at the eof if it is userdata and
+	 * allocation length is larger than a stripe unit.
+	 */
+	if (mp->m_dalign && bma->length >= mp->m_dalign &&
+	    !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+		error = xfs_bmap_isaeof(bma, whichfork);
+		if (error)
+			return error;
+	}
+
+	error = xfs_bmap_alloc(bma);
+	if (error)
+		return error;
+
+	if (bma->flist->xbf_low)
+		bma->minleft = 0;
+	if (bma->cur)
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+	if (bma->blkno == NULLFSBLOCK)
+		return 0;
+	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+		bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+		bma->cur->bc_private.b.flist = bma->flist;
+	}
+	/*
+	 * Bump the number of extents we've allocated
+	 * in this call.
+	 */
+	bma->nallocs++;
+
+	if (bma->cur)
+		bma->cur->bc_private.b.flags =
+			bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+
+	bma->got.br_startoff = bma->offset;
+	bma->got.br_startblock = bma->blkno;
+	bma->got.br_blockcount = bma->length;
+	bma->got.br_state = XFS_EXT_NORM;
+
+	/*
+	 * A wasdelay extent has been initialized, so shouldn't be flagged
+	 * as unwritten.
+	 */
+	if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
+	    xfs_sb_version_hasextflgbit(&mp->m_sb))
+		bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+	if (bma->wasdel)
+		error = xfs_bmap_add_extent_delay_real(bma);
+	else
+		error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+
+	bma->logflags |= tmp_logflags;
+	if (error)
+		return error;
+
+	/*
+	 * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+	 * or xfs_bmap_add_extent_hole_real might have merged it into one of
+	 * the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+	ASSERT(bma->got.br_startoff <= bma->offset);
+	ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+	       bma->offset + bma->length);
+	ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+	       bma->got.br_state == XFS_EXT_UNWRITTEN);
+	return 0;
+}
+
+STATIC int
+xfs_bmapi_convert_unwritten(
+	struct xfs_bmalloca	*bma,
+	struct xfs_bmbt_irec	*mval,
+	xfs_filblks_t		len,
+	int			flags)
+{
+	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	int			tmp_logflags = 0;
+	int			error;
+
+	/* check if we need to do unwritten->real conversion */
+	if (mval->br_state == XFS_EXT_UNWRITTEN &&
+	    (flags & XFS_BMAPI_PREALLOC))
+		return 0;
+
+	/* check if we need to do real->unwritten conversion */
+	if (mval->br_state == XFS_EXT_NORM &&
+	    (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+		return 0;
+
+	/*
+	 * Modify (by adding) the state flag, if writing.
+	 */
+	ASSERT(mval->br_blockcount <= len);
+	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+		bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+					bma->ip, whichfork);
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+		bma->cur->bc_private.b.flist = bma->flist;
+	}
+	mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+				? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+			&bma->cur, mval, bma->firstblock, bma->flist,
+			&tmp_logflags);
+	bma->logflags |= tmp_logflags;
+	if (error)
+		return error;
+
+	/*
+	 * Update our extent pointer, given that
+	 * xfs_bmap_add_extent_unwritten_real might have merged it into one
+	 * of the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+	/*
+	 * We may have combined previously unwritten space with written space,
+	 * so generate another request.
+	 */
+	if (mval->br_blockcount < len)
+		return EAGAIN;
+	return 0;
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary.  Details behaviour is controlled by the flags
+ * parameter.  Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ *
  * The returned value in "firstblock" from the first call in a transaction
  * must be remembered and presented to subsequent calls in "firstblock".
  * An upper bound for the number of blocks to be allocated is supplied to
  * the first call in "total"; if no allocation group has that many free
  * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
  */
-int					/* error */
-xfs_bmapi(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*ip,		/* incore inode */
-	xfs_fileoff_t	bno,		/* starting file offs. mapped */
-	xfs_filblks_t	len,		/* length to map in file */
-	int		flags,		/* XFS_BMAPI_... */
-	xfs_fsblock_t	*firstblock,	/* first allocated block
-					   controls a.g. for allocs */
-	xfs_extlen_t	total,		/* total blocks needed */
-	xfs_bmbt_irec_t	*mval,		/* output: map values */
-	int		*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t	*flist)		/* i/o: list extents to free */
+int
+xfs_bmapi_write(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting file offs. mapped */
+	xfs_filblks_t		len,		/* length to map in file */
+	int			flags,		/* XFS_BMAPI_... */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_extlen_t		total,		/* total blocks needed */
+	struct xfs_bmbt_irec	*mval,		/* output: map values */
+	int			*nmap,		/* i/o: mval size/count */
+	struct xfs_bmap_free	*flist)		/* i/o: list extents to free */
 {
-	xfs_fsblock_t	abno;		/* allocated block number */
-	xfs_extlen_t	alen;		/* allocated extent length */
-	xfs_fileoff_t	aoff;		/* allocated file offset */
-	xfs_bmalloca_t	bma = { 0 };	/* args for xfs_bmap_alloc */
-	xfs_btree_cur_t	*cur;		/* bmap btree cursor */
-	xfs_fileoff_t	end;		/* end of mapped file region */
-	int		eof;		/* we've hit the end of extents */
-	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
-	int		error;		/* error return */
-	xfs_bmbt_irec_t	got;		/* current file extent record */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_extlen_t	indlen;		/* indirect blocks length */
-	xfs_extnum_t	lastx;		/* last useful extent number */
-	int		logflags;	/* flags for transaction logging */
-	xfs_extlen_t	minleft;	/* min blocks left after allocation */
-	xfs_extlen_t	minlen;		/* min allocation size */
-	xfs_mount_t	*mp;		/* xfs mount structure */
-	int		n;		/* current extent index */
-	int		nallocs;	/* number of extents alloc'd */
-	xfs_extnum_t	nextents;	/* number of extents in file */
-	xfs_fileoff_t	obno;		/* old block number (offset) */
-	xfs_bmbt_irec_t	prev;		/* previous file extent record */
-	int		tmp_logflags;	/* temp flags holder */
-	int		whichfork;	/* data or attr fork */
-	char		inhole;		/* current location is hole in file */
-	char		wasdelay;	/* old extent was delayed */
-	char		wr;		/* this is a write request */
-	char		rt;		/* this is a realtime file */
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp;
+	struct xfs_bmalloca	bma = { 0 };	/* args for xfs_bmap_alloc */
+	xfs_fileoff_t		end;		/* end of mapped file region */
+	int			eof;		/* after the end of extents */
+	int			error;		/* error return */
+	int			n;		/* current extent index */
+	xfs_fileoff_t		obno;		/* old block number (offset) */
+	int			whichfork;	/* data or attr fork */
+	char			inhole;		/* current location is hole in file */
+	char			wasdelay;	/* old extent was delayed */
+
 #ifdef DEBUG
-	xfs_fileoff_t	orig_bno;	/* original block number value */
-	int		orig_flags;	/* original flags arg value */
-	xfs_filblks_t	orig_len;	/* original value of len arg */
-	xfs_bmbt_irec_t	*orig_mval;	/* original value of mval */
-	int		orig_nmap;	/* original value of *nmap */
+	xfs_fileoff_t		orig_bno;	/* original block number value */
+	int			orig_flags;	/* original flags arg value */
+	xfs_filblks_t		orig_len;	/* original value of len arg */
+	struct xfs_bmbt_irec	*orig_mval;	/* original value of mval */
+	int			orig_nmap;	/* original value of *nmap */
 
 	orig_bno = bno;
 	orig_len = len;
@@ -4360,488 +4844,133 @@ xfs_bmapi(
 	orig_mval = mval;
 	orig_nmap = *nmap;
 #endif
+
 	ASSERT(*nmap >= 1);
-	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+	ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+	ASSERT(tp != NULL);
+
 	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
 		XFS_ATTR_FORK : XFS_DATA_FORK;
-	mp = ip->i_mount;
+
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
 	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
 	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
 	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-		XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp);
+		XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
-	rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(ifp->if_ext_max ==
 	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
-	if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
-		XFS_STATS_INC(xs_blk_mapw);
-	else
-		XFS_STATS_INC(xs_blk_mapr);
-	/*
-	 * IGSTATE flag is used to combine extents which
-	 * differ only due to the state of the extents.
-	 * This technique is used from xfs_getbmap()
-	 * when the caller does not wish to see the
-	 * separation (which is the default).
-	 *
-	 * This technique is also used when writing a
-	 * buffer which has been partially written,
-	 * (usually by being flushed during a chunkread),
-	 * to ensure one write takes place. This also
-	 * prevents a change in the xfs inode extents at
-	 * this time, intentionally. This change occurs
-	 * on completion of the write operation, in
-	 * xfs_strat_comp(), where the xfs_bmapi() call
-	 * is transactioned, and the extents combined.
-	 */
-	if ((flags & XFS_BMAPI_IGSTATE) && wr)	/* if writing unwritten space */
-		wr = 0;				/* no allocations are allowed */
-	ASSERT(wr || !(flags & XFS_BMAPI_DELAY));
-	logflags = 0;
-	nallocs = 0;
-	cur = NULL;
+
+	XFS_STATS_INC(xs_blk_mapw);
+
 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-		ASSERT(wr && tp);
-		if ((error = xfs_bmap_local_to_extents(tp, ip,
-				firstblock, total, &logflags, whichfork)))
+		error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
+						  &bma.logflags, whichfork);
+		if (error)
 			goto error0;
 	}
-	if (wr && *firstblock == NULLFSBLOCK) {
+
+	if (*firstblock == NULLFSBLOCK) {
 		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-			minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
 		else
-			minleft = 1;
-	} else
-		minleft = 0;
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		goto error0;
-	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-		&prev);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+			bma.minleft = 1;
+	} else {
+		bma.minleft = 0;
+	}
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			goto error0;
+	}
+
+	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
+				&bma.prev);
 	n = 0;
 	end = bno + len;
 	obno = bno;
-	bma.ip = NULL;
+
+	bma.tp = tp;
+	bma.ip = ip;
+	bma.total = total;
+	bma.userdata = 0;
+	bma.flist = flist;
+	bma.firstblock = firstblock;
 
 	while (bno < end && n < *nmap) {
-		/*
-		 * Reading past eof, act as though there's a hole
-		 * up to end.
-		 */
-		if (eof && !wr)
-			got.br_startoff = end;
-		inhole = eof || got.br_startoff > bno;
-		wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
-			isnullstartblock(got.br_startblock);
+		inhole = eof || bma.got.br_startoff > bno;
+		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+
 		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
-		if (wr && (inhole || wasdelay)) {
-			/*
-			 * For the wasdelay case, we could also just
-			 * allocate the stuff asked for in this bmap call
-			 * but that wouldn't be as good.
-			 */
-			if (wasdelay) {
-				alen = (xfs_extlen_t)got.br_blockcount;
-				aoff = got.br_startoff;
-				if (lastx != NULLEXTNUM && lastx) {
-					ep = xfs_iext_get_ext(ifp, lastx - 1);
-					xfs_bmbt_get_all(ep, &prev);
-				}
-			} else {
-				alen = (xfs_extlen_t)
-					XFS_FILBLKS_MIN(len, MAXEXTLEN);
-				if (!eof)
-					alen = (xfs_extlen_t)
-						XFS_FILBLKS_MIN(alen,
-							got.br_startoff - bno);
-				aoff = bno;
-			}
-			minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1;
-			if (flags & XFS_BMAPI_DELAY) {
-				xfs_extlen_t	extsz;
-
-				/* Figure out the extent size, adjust alen */
-				extsz = xfs_get_extsz_hint(ip);
-				if (extsz) {
-					/*
-					 * make sure we don't exceed a single
-					 * extent length when we align the
-					 * extent by reducing length we are
-					 * going to allocate by the maximum
-					 * amount extent size aligment may
-					 * require.
-					 */
-					alen = XFS_FILBLKS_MIN(len,
-						   MAXEXTLEN - (2 * extsz - 1));
-					error = xfs_bmap_extsize_align(mp,
-							&got, &prev, extsz,
-							rt, eof,
-							flags&XFS_BMAPI_DELAY,
-							flags&XFS_BMAPI_CONVERT,
-							&aoff, &alen);
-					ASSERT(!error);
-				}
-
-				if (rt)
-					extsz = alen / mp->m_sb.sb_rextsize;
-
-				/*
-				 * Make a transaction-less quota reservation for
-				 * delayed allocation blocks. This number gets
-				 * adjusted later.  We return if we haven't
-				 * allocated blocks already inside this loop.
-				 */
-				error = xfs_trans_reserve_quota_nblks(
-						NULL, ip, (long)alen, 0,
-						rt ? XFS_QMOPT_RES_RTBLKS :
-						     XFS_QMOPT_RES_REGBLKS);
-				if (error) {
-					if (n == 0) {
-						*nmap = 0;
-						ASSERT(cur == NULL);
-						return error;
-					}
-					break;
-				}
-
-				/*
-				 * Split changing sb for alen and indlen since
-				 * they could be coming from different places.
-				 */
-				indlen = (xfs_extlen_t)
-					xfs_bmap_worst_indlen(ip, alen);
-				ASSERT(indlen > 0);
-
-				if (rt) {
-					error = xfs_mod_incore_sb(mp,
-							XFS_SBS_FREXTENTS,
-							-((int64_t)extsz), 0);
-				} else {
-					error = xfs_icsb_modify_counters(mp,
-							XFS_SBS_FDBLOCKS,
-							-((int64_t)alen), 0);
-				}
-				if (!error) {
-					error = xfs_icsb_modify_counters(mp,
-							XFS_SBS_FDBLOCKS,
-							-((int64_t)indlen), 0);
-					if (error && rt)
-						xfs_mod_incore_sb(mp,
-							XFS_SBS_FREXTENTS,
-							(int64_t)extsz, 0);
-					else if (error)
-						xfs_icsb_modify_counters(mp,
-							XFS_SBS_FDBLOCKS,
-							(int64_t)alen, 0);
-				}
-
-				if (error) {
-					if (XFS_IS_QUOTA_ON(mp))
-						/* unreserve the blocks now */
-						(void)
-						xfs_trans_unreserve_quota_nblks(
-							NULL, ip,
-							(long)alen, 0, rt ?
-							XFS_QMOPT_RES_RTBLKS :
-							XFS_QMOPT_RES_REGBLKS);
-					break;
-				}
-
-				ip->i_delayed_blks += alen;
-				abno = nullstartblock(indlen);
-			} else {
-				/*
-				 * If first time, allocate and fill in
-				 * once-only bma fields.
-				 */
-				if (bma.ip == NULL) {
-					bma.tp = tp;
-					bma.ip = ip;
-					bma.prevp = &prev;
-					bma.gotp = &got;
-					bma.total = total;
-					bma.userdata = 0;
-				}
-				/* Indicate if this is the first user data
-				 * in the file, or just any user data.
-				 */
-				if (!(flags & XFS_BMAPI_METADATA)) {
-					bma.userdata = (aoff == 0) ?
-						XFS_ALLOC_INITIAL_USER_DATA :
-						XFS_ALLOC_USERDATA;
-				}
-				/*
-				 * Fill in changeable bma fields.
-				 */
-				bma.eof = eof;
-				bma.firstblock = *firstblock;
-				bma.alen = alen;
-				bma.off = aoff;
-				bma.conv = !!(flags & XFS_BMAPI_CONVERT);
-				bma.wasdel = wasdelay;
-				bma.minlen = minlen;
-				bma.low = flist->xbf_low;
-				bma.minleft = minleft;
-				/*
-				 * Only want to do the alignment at the
-				 * eof if it is userdata and allocation length
-				 * is larger than a stripe unit.
-				 */
-				if (mp->m_dalign && alen >= mp->m_dalign &&
-				    (!(flags & XFS_BMAPI_METADATA)) &&
-				    (whichfork == XFS_DATA_FORK)) {
-					if ((error = xfs_bmap_isaeof(ip, aoff,
-							whichfork, &bma.aeof)))
-						goto error0;
-				} else
-					bma.aeof = 0;
-				/*
-				 * Call allocator.
-				 */
-				if ((error = xfs_bmap_alloc(&bma)))
-					goto error0;
-				/*
-				 * Copy out result fields.
-				 */
-				abno = bma.rval;
-				if ((flist->xbf_low = bma.low))
-					minleft = 0;
-				alen = bma.alen;
-				aoff = bma.off;
-				ASSERT(*firstblock == NULLFSBLOCK ||
-				       XFS_FSB_TO_AGNO(mp, *firstblock) ==
-				       XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
-				       (flist->xbf_low &&
-					XFS_FSB_TO_AGNO(mp, *firstblock) <
-					XFS_FSB_TO_AGNO(mp, bma.firstblock)));
-				*firstblock = bma.firstblock;
-				if (cur)
-					cur->bc_private.b.firstblock =
-						*firstblock;
-				if (abno == NULLFSBLOCK)
-					break;
-				if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-					cur = xfs_bmbt_init_cursor(mp, tp,
-						ip, whichfork);
-					cur->bc_private.b.firstblock =
-						*firstblock;
-					cur->bc_private.b.flist = flist;
-				}
-				/*
-				 * Bump the number of extents we've allocated
-				 * in this call.
-				 */
-				nallocs++;
-			}
-			if (cur)
-				cur->bc_private.b.flags =
-					wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
-			got.br_startoff = aoff;
-			got.br_startblock = abno;
-			got.br_blockcount = alen;
-			got.br_state = XFS_EXT_NORM;	/* assume normal */
-			/*
-			 * Determine state of extent, and the filesystem.
-			 * A wasdelay extent has been initialized, so
-			 * shouldn't be flagged as unwritten.
-			 */
-			if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-				if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
-					got.br_state = XFS_EXT_UNWRITTEN;
-			}
-			error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got,
-				firstblock, flist, &tmp_logflags,
-				whichfork);
-			logflags |= tmp_logflags;
+		if (inhole || wasdelay) {
+			bma.eof = eof;
+			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+			bma.wasdel = wasdelay;
+			bma.length = len;
+			bma.offset = bno;
+
+			error = xfs_bmapi_allocate(&bma, flags);
 			if (error)
 				goto error0;
-			ep = xfs_iext_get_ext(ifp, lastx);
-			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-			xfs_bmbt_get_all(ep, &got);
-			ASSERT(got.br_startoff <= aoff);
-			ASSERT(got.br_startoff + got.br_blockcount >=
-				aoff + alen);
-#ifdef DEBUG
-			if (flags & XFS_BMAPI_DELAY) {
-				ASSERT(isnullstartblock(got.br_startblock));
-				ASSERT(startblockval(got.br_startblock) > 0);
-			}
-			ASSERT(got.br_state == XFS_EXT_NORM ||
-			       got.br_state == XFS_EXT_UNWRITTEN);
-#endif
-			/*
-			 * Fall down into the found allocated space case.
-			 */
-		} else if (inhole) {
-			/*
-			 * Reading in a hole.
-			 */
-			mval->br_startoff = bno;
-			mval->br_startblock = HOLESTARTBLOCK;
-			mval->br_blockcount =
-				XFS_FILBLKS_MIN(len, got.br_startoff - bno);
-			mval->br_state = XFS_EXT_NORM;
-			bno += mval->br_blockcount;
-			len -= mval->br_blockcount;
-			mval++;
-			n++;
-			continue;
-		}
-		/*
-		 * Then deal with the allocated space we found.
-		 */
-		ASSERT(ep != NULL);
-		if (!(flags & XFS_BMAPI_ENTIRE) &&
-		    (got.br_startoff + got.br_blockcount > obno)) {
-			if (obno > bno)
-				bno = obno;
-			ASSERT((bno >= obno) || (n == 0));
-			ASSERT(bno < end);
-			mval->br_startoff = bno;
-			if (isnullstartblock(got.br_startblock)) {
-				ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
-				mval->br_startblock = DELAYSTARTBLOCK;
-			} else
-				mval->br_startblock =
-					got.br_startblock +
-					(bno - got.br_startoff);
-			/*
-			 * Return the minimum of what we got and what we
-			 * asked for for the length.  We can use the len
-			 * variable here because it is modified below
-			 * and we could have been there before coming
-			 * here if the first part of the allocation
-			 * didn't overlap what was asked for.
-			 */
-			mval->br_blockcount =
-				XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
-					(bno - got.br_startoff));
-			mval->br_state = got.br_state;
-			ASSERT(mval->br_blockcount <= len);
-		} else {
-			*mval = got;
-			if (isnullstartblock(mval->br_startblock)) {
-				ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
-				mval->br_startblock = DELAYSTARTBLOCK;
-			}
+			if (bma.blkno == NULLFSBLOCK)
+				break;
 		}
 
-		/*
-		 * Check if writing previously allocated but
-		 * unwritten extents.
-		 */
-		if (wr &&
-		    ((mval->br_state == XFS_EXT_UNWRITTEN &&
-		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
-		     (mval->br_state == XFS_EXT_NORM &&
-		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
-				(XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
-			/*
-			 * Modify (by adding) the state flag, if writing.
-			 */
-			ASSERT(mval->br_blockcount <= len);
-			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-				cur = xfs_bmbt_init_cursor(mp,
-					tp, ip, whichfork);
-				cur->bc_private.b.firstblock =
-					*firstblock;
-				cur->bc_private.b.flist = flist;
-			}
-			mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
-						? XFS_EXT_NORM
-						: XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval,
-				firstblock, flist, &tmp_logflags,
-				whichfork);
-			logflags |= tmp_logflags;
-			if (error)
-				goto error0;
-			ep = xfs_iext_get_ext(ifp, lastx);
-			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-			xfs_bmbt_get_all(ep, &got);
-			/*
-			 * We may have combined previously unwritten
-			 * space with written space, so generate
-			 * another request.
-			 */
-			if (mval->br_blockcount < len)
-				continue;
-		}
+		/* Deal with the allocated space we found.  */
+		xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+							end, n, flags);
+
+		/* Execute unwritten extent conversion if necessary */
+		error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+		if (error == EAGAIN)
+			continue;
+		if (error)
+			goto error0;
+
+		/* update the extent map to return */
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
 
-		ASSERT((flags & XFS_BMAPI_ENTIRE) ||
-		       ((mval->br_startoff + mval->br_blockcount) <= end));
-		ASSERT((flags & XFS_BMAPI_ENTIRE) ||
-		       (mval->br_blockcount <= len) ||
-		       (mval->br_startoff < obno));
-		bno = mval->br_startoff + mval->br_blockcount;
-		len = end - bno;
-		if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
-			ASSERT(mval->br_startblock == mval[-1].br_startblock);
-			ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
-			ASSERT(mval->br_state == mval[-1].br_state);
-			mval[-1].br_blockcount = mval->br_blockcount;
-			mval[-1].br_state = mval->br_state;
-		} else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
-			   mval[-1].br_startblock != DELAYSTARTBLOCK &&
-			   mval[-1].br_startblock != HOLESTARTBLOCK &&
-			   mval->br_startblock ==
-			   mval[-1].br_startblock + mval[-1].br_blockcount &&
-			   ((flags & XFS_BMAPI_IGSTATE) ||
-				mval[-1].br_state == mval->br_state)) {
-			ASSERT(mval->br_startoff ==
-			       mval[-1].br_startoff + mval[-1].br_blockcount);
-			mval[-1].br_blockcount += mval->br_blockcount;
-		} else if (n > 0 &&
-			   mval->br_startblock == DELAYSTARTBLOCK &&
-			   mval[-1].br_startblock == DELAYSTARTBLOCK &&
-			   mval->br_startoff ==
-			   mval[-1].br_startoff + mval[-1].br_blockcount) {
-			mval[-1].br_blockcount += mval->br_blockcount;
-			mval[-1].br_state = mval->br_state;
-		} else if (!((n == 0) &&
-			     ((mval->br_startoff + mval->br_blockcount) <=
-			      obno))) {
-			mval++;
-			n++;
-		}
 		/*
 		 * If we're done, stop now.  Stop when we've allocated
 		 * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
 		 * the transaction may get too big.
 		 */
-		if (bno >= end || n >= *nmap || nallocs >= *nmap)
+		if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
 			break;
-		/*
-		 * Else go on to the next record.
-		 */
-		prev = got;
-		if (++lastx < nextents) {
-			ep = xfs_iext_get_ext(ifp, lastx);
-			xfs_bmbt_get_all(ep, &got);
-		} else {
+
+		/* Else go on to the next record. */
+		bma.prev = bma.got;
+		if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+					 &bma.got);
+		} else
 			eof = 1;
-		}
 	}
 	*nmap = n;
+
 	/*
 	 * Transform from btree to extents, give it cur.
 	 */
-	if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
 	    XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
-		ASSERT(wr && cur);
-		error = xfs_bmap_btree_to_extents(tp, ip, cur,
+		int		tmp_logflags = 0;
+
+		ASSERT(bma.cur);
+		error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
 			&tmp_logflags, whichfork);
-		logflags |= tmp_logflags;
+		bma.logflags |= tmp_logflags;
 		if (error)
 			goto error0;
 	}
@@ -4855,34 +4984,33 @@ error0:
 	 * Log everything.  Do this after conversion, there's no point in
 	 * logging the extent records if we've converted to btree format.
 	 */
-	if ((logflags & xfs_ilog_fext(whichfork)) &&
+	if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		logflags &= ~xfs_ilog_fext(whichfork);
-	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+		bma.logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
 		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		logflags &= ~xfs_ilog_fbroot(whichfork);
+		bma.logflags &= ~xfs_ilog_fbroot(whichfork);
 	/*
 	 * Log whatever the flags say, even if error.  Otherwise we might miss
 	 * detecting a case where the data is changed, there's an error,
 	 * and it's not logged so we don't shutdown when we should.
 	 */
-	if (logflags) {
-		ASSERT(tp && wr);
-		xfs_trans_log_inode(tp, ip, logflags);
-	}
-	if (cur) {
+	if (bma.logflags)
+		xfs_trans_log_inode(tp, ip, bma.logflags);
+
+	if (bma.cur) {
 		if (!error) {
 			ASSERT(*firstblock == NULLFSBLOCK ||
 			       XFS_FSB_TO_AGNO(mp, *firstblock) ==
 			       XFS_FSB_TO_AGNO(mp,
-				       cur->bc_private.b.firstblock) ||
+				       bma.cur->bc_private.b.firstblock) ||
 			       (flist->xbf_low &&
 				XFS_FSB_TO_AGNO(mp, *firstblock) <
 				XFS_FSB_TO_AGNO(mp,
-					cur->bc_private.b.firstblock)));
-			*firstblock = cur->bc_private.b.firstblock;
+					bma.cur->bc_private.b.firstblock)));
+			*firstblock = bma.cur->bc_private.b.firstblock;
 		}
-		xfs_btree_del_cursor(cur,
+		xfs_btree_del_cursor(bma.cur,
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 	}
 	if (!error)
@@ -4892,58 +5020,6 @@ error0:
 }
 
 /*
- * Map file blocks to filesystem blocks, simple version.
- * One block (extent) only, read-only.
- * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA
- * was set and all the others were clear.
- */
-int						/* error */
-xfs_bmapi_single(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*ip,		/* incore inode */
-	int		whichfork,	/* data or attr fork */
-	xfs_fsblock_t	*fsb,		/* output: mapped block */
-	xfs_fileoff_t	bno)		/* starting file offs. mapped */
-{
-	int		eof;		/* we've hit the end of extents */
-	int		error;		/* error return */
-	xfs_bmbt_irec_t	got;		/* current file extent record */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_extnum_t	lastx;		/* last useful extent number */
-	xfs_bmbt_irec_t	prev;		/* previous file extent record */
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (unlikely(
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
-	       XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
-				ip->i_mount);
-	       return XFS_ERROR(EFSCORRUPTED);
-	}
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return XFS_ERROR(EIO);
-	XFS_STATS_INC(xs_blk_mapr);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		return error;
-	(void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-		&prev);
-	/*
-	 * Reading past eof, act as though there's a hole
-	 * up to end.
-	 */
-	if (eof || got.br_startoff > bno) {
-		*fsb = NULLFSBLOCK;
-		return 0;
-	}
-	ASSERT(!isnullstartblock(got.br_startblock));
-	ASSERT(bno < got.br_startoff + got.br_blockcount);
-	*fsb = got.br_startblock + (bno - got.br_startoff);
-	return 0;
-}
-
-/*
  * Unmap (remove) blocks from a file.
  * If nexts is nonzero then the number of extents to remove is limited to
  * that value.  If not all extents in the block range can be removed then
@@ -5114,9 +5190,9 @@ xfs_bunmapi(
 				del.br_blockcount = mod;
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del,
-				firstblock, flist, &logflags,
-				XFS_DATA_FORK);
+			error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+					&lastx, &cur, &del, firstblock, flist,
+					&logflags);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5172,18 +5248,18 @@ xfs_bunmapi(
 				}
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				lastx--;
-				error = xfs_bmap_add_extent(tp, ip, &lastx,
-						&cur, &prev, firstblock, flist,
-						&logflags, XFS_DATA_FORK);
+				error = xfs_bmap_add_extent_unwritten_real(tp,
+						ip, &lastx, &cur, &prev,
+						firstblock, flist, &logflags);
 				if (error)
 					goto error0;
 				goto nodelete;
 			} else {
 				ASSERT(del.br_state == XFS_EXT_NORM);
 				del.br_state = XFS_EXT_UNWRITTEN;
-				error = xfs_bmap_add_extent(tp, ip, &lastx,
-						&cur, &del, firstblock, flist,
-						&logflags, XFS_DATA_FORK);
+				error = xfs_bmap_add_extent_unwritten_real(tp,
+						ip, &lastx, &cur, &del,
+						firstblock, flist, &logflags);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5505,10 +5581,9 @@ xfs_getbmap(
 
 	do {
 		nmap = (nexleft > subnex) ? subnex : nexleft;
-		error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
-				  XFS_BB_TO_FSB(mp, bmv->bmv_length),
-				  bmapi_flags, NULL, 0, map, &nmap,
-				  NULL);
+		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
+				       map, &nmap, bmapi_flags);
 		if (error)
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
@@ -5582,89 +5657,6 @@ xfs_getbmap(
 	return error;
 }
 
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- */
-STATIC int				/* error */
-xfs_bmap_isaeof(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_fileoff_t   off,		/* file offset in fsblocks */
-	int             whichfork,	/* data or attribute fork */
-	char		*aeof)		/* return value */
-{
-	int		error;		/* error return value */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_bmbt_rec_host_t *lastrec;	/* extent record pointer */
-	xfs_extnum_t	nextents;	/* number of file extents */
-	xfs_bmbt_irec_t	s;		/* expanded extent record */
-
-	ASSERT(whichfork == XFS_DATA_FORK);
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(NULL, ip, whichfork)))
-		return error;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	if (nextents == 0) {
-		*aeof = 1;
-		return 0;
-	}
-	/*
-	 * Go to the last extent
-	 */
-	lastrec = xfs_iext_get_ext(ifp, nextents - 1);
-	xfs_bmbt_get_all(lastrec, &s);
-	/*
-	 * Check we are allocating in the last extent (for delayed allocations)
-	 * or past the last extent for non-delayed allocations.
-	 */
-	*aeof = (off >= s.br_startoff &&
-		 off < s.br_startoff + s.br_blockcount &&
-		 isnullstartblock(s.br_startblock)) ||
-		off >= s.br_startoff + s.br_blockcount;
-	return 0;
-}
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary.
- */
-int					/* error */
-xfs_bmap_eof(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_fileoff_t	endoff,		/* file offset in fsblocks */
-	int		whichfork,	/* data or attribute fork */
-	int		*eof)		/* result value */
-{
-	xfs_fsblock_t	blockcount;	/* extent block count */
-	int		error;		/* error return value */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_bmbt_rec_host_t *lastrec;	/* extent record pointer */
-	xfs_extnum_t	nextents;	/* number of file extents */
-	xfs_fileoff_t	startoff;	/* extent starting file offset */
-
-	ASSERT(whichfork == XFS_DATA_FORK);
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(NULL, ip, whichfork)))
-		return error;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	if (nextents == 0) {
-		*eof = 1;
-		return 0;
-	}
-	/*
-	 * Go to the last extent
-	 */
-	lastrec = xfs_iext_get_ext(ifp, nextents - 1);
-	startoff = xfs_bmbt_get_startoff(lastrec);
-	blockcount = xfs_bmbt_get_blockcount(lastrec);
-	*eof = endoff >= startoff + blockcount;
-	return 0;
-}
-
 #ifdef DEBUG
 STATIC struct xfs_buf *
 xfs_bmap_get_bp(
@@ -6099,9 +6091,8 @@ xfs_bmap_punch_delalloc_range(
 		 * trying to remove a real extent (which requires a
 		 * transaction) or a hole, which is probably a bad idea...
 		 */
-		error = xfs_bmapi(NULL, ip, start_fsb, 1,
-				XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
-				&nimaps, NULL);
+		error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+				       XFS_BMAPI_ENTIRE);
 
 		if (error) {
 			/* something screwed, just bail */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index c62234bde05..89ee672d378 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -62,27 +62,23 @@ typedef	struct xfs_bmap_free
 #define	XFS_BMAP_MAX_NMAP	4
 
 /*
- * Flags for xfs_bmapi
+ * Flags for xfs_bmapi_*
  */
-#define	XFS_BMAPI_WRITE		0x001	/* write operation: allocate space */
-#define XFS_BMAPI_DELAY		0x002	/* delayed write operation */
-#define XFS_BMAPI_ENTIRE	0x004	/* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA	0x008	/* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK	0x010	/* use attribute fork not data */
-#define	XFS_BMAPI_PREALLOC	0x040	/* preallocation op: unwritten space */
-#define	XFS_BMAPI_IGSTATE	0x080	/* Ignore state - */
+#define XFS_BMAPI_ENTIRE	0x001	/* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA	0x002	/* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK	0x004	/* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
 					/* combine contig. space */
-#define	XFS_BMAPI_CONTIG	0x100	/* must allocate only one extent */
+#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
 /*
  * unwritten extent conversion - this needs write cache flushing and no additional
  * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
  * from written to unwritten, otherwise convert from unwritten to written.
  */
-#define XFS_BMAPI_CONVERT	0x200
+#define XFS_BMAPI_CONVERT	0x040
 
 #define XFS_BMAPI_FLAGS \
-	{ XFS_BMAPI_WRITE,	"WRITE" }, \
-	{ XFS_BMAPI_DELAY,	"DELAY" }, \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
 	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
@@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
  * Argument structure for xfs_bmap_alloc.
  */
 typedef struct xfs_bmalloca {
-	xfs_fsblock_t		firstblock; /* i/o first block allocated */
-	xfs_fsblock_t		rval;	/* starting block of new extent */
-	xfs_fileoff_t		off;	/* offset in file filling in */
+	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
+	struct xfs_bmap_free	*flist;	/* bmap freelist */
 	struct xfs_trans	*tp;	/* transaction pointer */
 	struct xfs_inode	*ip;	/* incore inode pointer */
-	struct xfs_bmbt_irec	*prevp;	/* extent before the new one */
-	struct xfs_bmbt_irec	*gotp;	/* extent after, or delayed */
-	xfs_extlen_t		alen;	/* i/o length asked/allocated */
+	struct xfs_bmbt_irec	prev;	/* extent before the new one */
+	struct xfs_bmbt_irec	got;	/* extent after, or delayed */
+
+	xfs_fileoff_t		offset;	/* offset in file filling in */
+	xfs_extlen_t		length;	/* i/o length asked/allocated */
+	xfs_fsblock_t		blkno;	/* starting block of new extent */
+
+	struct xfs_btree_cur	*cur;	/* btree cursor */
+	xfs_extnum_t		idx;	/* current extent index */
+	int			nallocs;/* number of extents alloc'd */
+	int			logflags;/* flags for transaction logging */
+
 	xfs_extlen_t		total;	/* total blocks needed for xaction */
 	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
 	xfs_extlen_t		minleft; /* amount must be left after alloc */
 	char			eof;	/* set if allocating past last extent */
 	char			wasdel;	/* replacing a delayed allocation */
 	char			userdata;/* set if is user data */
-	char			low;	/* low on space, using seq'l ags */
 	char			aeof;	/* allocated space at eof */
 	char			conv;	/* overwriting unwritten extents */
 } xfs_bmalloca_t;
@@ -152,251 +155,62 @@ typedef struct xfs_bmalloca {
 	{ BMAP_RIGHT_FILLING,	"RF" }, \
 	{ BMAP_ATTRFORK,	"ATTR" }
 
-/*
- * Add bmap trace insert entries for all the contents of the extent list.
- *
- * Quite excessive tracing.  Only do this for debug builds.
- */
 #if defined(__KERNEL) && defined(DEBUG)
-void
-xfs_bmap_trace_exlist(
-	struct xfs_inode	*ip,		/* incore inode pointer */
-	xfs_extnum_t		cnt,		/* count of entries in list */
-	int			whichfork,
-	unsigned long		caller_ip);	/* data or attr fork */
+void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+		int whichfork, unsigned long caller_ip);
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
 	xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
 #else
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
 #endif
 
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int					/* error code */
-xfs_bmap_add_attrfork(
-	struct xfs_inode	*ip,	/* incore inode pointer */
-	int			size,	/* space needed for new attribute */
-	int			rsvd);	/* flag for reserved block allocation */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-xfs_bmap_add_free(
-	xfs_fsblock_t		bno,		/* fs block number of extent */
-	xfs_filblks_t		len,		/* length of extent */
-	xfs_bmap_free_t		*flist,		/* list of extents */
-	struct xfs_mount	*mp);		/* mount point structure */
-
-/*
- * Routine to clean up the free list data structure when
- * an error occurs during a transaction.
- */
-void
-xfs_bmap_cancel(
-	xfs_bmap_free_t		*flist);	/* free list to clean up */
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem.  Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
-	struct xfs_mount	*mp,	/* file system mount structure */
-	int			whichfork);	/* data or attr fork */
-
-/*
- * Returns the file-relative block number of the first unused block in the file.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- */
-int						/* error */
-xfs_bmap_first_unused(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	xfs_extlen_t		len,		/* size of hole to find */
-	xfs_fileoff_t		*unused,	/* unused block num */
-	int			whichfork);	/* data or attr fork */
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int						/* error */
-xfs_bmap_last_before(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	xfs_fileoff_t		*last_block,	/* last block */
-	int			whichfork);	/* data or attr fork */
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file.  This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int						/* error */
-xfs_bmap_last_offset(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	xfs_fileoff_t		*unused,	/* last block num */
-	int			whichfork);	/* data or attr fork */
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not.  For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int
-xfs_bmap_one_block(
-	struct xfs_inode	*ip,		/* incore inode */
-	int			whichfork);	/* data or attr fork */
-
-/*
- * Read in the extents to iu_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in.
- */
-int						/* error */
-xfs_bmap_read_extents(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	int			whichfork);	/* data or attr fork */
-
-/*
- * Map file blocks to filesystem blocks.
- * File range is given by the bno/len pair.
- * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
- * into a hole or past eof.
- * Only allocates blocks from a single allocation group,
- * to avoid locking problems.
- * The returned value in "firstblock" from the first call in a transaction
- * must be remembered and presented to subsequent calls in "firstblock".
- * An upper bound for the number of blocks to be allocated is supplied to
- * the first call in "total"; if no allocation group has that many free
- * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
- */
-int						/* error */
-xfs_bmapi(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	xfs_fileoff_t		bno,		/* starting file offs. mapped */
-	xfs_filblks_t		len,		/* length to map in file */
-	int			flags,		/* XFS_BMAPI_... */
-	xfs_fsblock_t		*firstblock,	/* first allocated block
-						   controls a.g. for allocs */
-	xfs_extlen_t		total,		/* total blocks needed */
-	struct xfs_bmbt_irec	*mval,		/* output: map values */
-	int			*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t		*flist);	/* i/o: list extents to free */
-
-/*
- * Map file blocks to filesystem blocks, simple version.
- * One block only, read-only.
- * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA
- * was set and all the others were clear.
- */
-int						/* error */
-xfs_bmapi_single(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	int			whichfork,	/* data or attr fork */
-	xfs_fsblock_t		*fsb,		/* output: mapped block */
-	xfs_fileoff_t		bno);		/* starting file offs. mapped */
-
-/*
- * Unmap (remove) blocks from a file.
- * If nexts is nonzero then the number of extents to remove is limited to
- * that value.  If not all extents in the block range can be removed then
- * *done is set.
- */
-int						/* error */
-xfs_bunmapi(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	xfs_fileoff_t		bno,		/* starting offset to unmap */
-	xfs_filblks_t		len,		/* length to unmap in file */
-	int			flags,		/* XFS_BMAPI_... */
-	xfs_extnum_t		nexts,		/* number of extents max */
-	xfs_fsblock_t		*firstblock,	/* first allocated block
-						   controls a.g. for allocs */
-	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	int			*done);		/* set if not done yet */
-
-/*
- * Check an extent list, which has just been read, for
- * any bit in the extent flag field.
- */
-int
-xfs_check_nostate_extents(
-	struct xfs_ifork	*ifp,
-	xfs_extnum_t		idx,
-	xfs_extnum_t		num);
-
-uint
-xfs_default_attroffset(
-	struct xfs_inode	*ip);
+int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+		struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
+void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int	xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *last_block, int whichfork);
+int	xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *unused, int whichfork);
+int	xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int	xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		int whichfork);
+int	xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_fsblock_t *firstblock, xfs_extlen_t total,
+		struct xfs_bmbt_irec *mval, int *nmap,
+		struct xfs_bmap_free *flist);
+int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, int *done);
+int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+		xfs_extnum_t num);
+uint	xfs_default_attroffset(struct xfs_inode *ip);
 
 #ifdef __KERNEL__
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller.  Frees all the extents that need freeing, which must be done
- * last due to locking considerations.
- *
- * Return 1 if the given transaction was committed and a new one allocated,
- * and 0 otherwise.
- */
-int						/* error */
-xfs_bmap_finish(
-	struct xfs_trans	**tp,		/* transaction pointer addr */
-	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	int			*committed);	/* xact committed or not */
-
 /* bmap to userspace formatter - copy to user & advance pointer */
 typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
 
-/*
- * Get inode's extents as described in bmv, and format for output.
- */
-int						/* error code */
-xfs_getbmap(
-	xfs_inode_t		*ip,
-	struct getbmapx		*bmv,		/* user bmap structure */
-	xfs_bmap_format_t	formatter,	/* format to user */
-	void			*arg);		/* formatter arg */
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary
- */
-int
-xfs_bmap_eof(
-	struct xfs_inode        *ip,
-	xfs_fileoff_t           endoff,
-	int                     whichfork,
-	int                     *eof);
-
-/*
- * Count fsblocks of the given fork.
- */
-int
-xfs_bmap_count_blocks(
-	xfs_trans_t		*tp,
-	struct xfs_inode	*ip,
-	int			whichfork,
-	int			*count);
-
-int
-xfs_bmap_punch_delalloc_range(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		start_fsb,
-	xfs_fileoff_t		length);
+int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+		int *committed);
+int	xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+		xfs_bmap_format_t formatter, void *arg);
+int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+		int whichfork, int *eof);
+int	xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+		int whichfork, int *count);
+int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+		xfs_fileoff_t start_fsb, xfs_fileoff_t length);
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 2b9fd385e27..1f19f03af9d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -631,7 +631,7 @@ xfs_btree_read_bufl(
 	}
 	ASSERT(!xfs_buf_geterror(bp));
 	if (bp)
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+		xfs_buf_set_ref(bp, refval);
 	*bpp = bp;
 	return 0;
 }
@@ -939,13 +939,13 @@ xfs_btree_set_refs(
 	switch (cur->bc_btnum) {
 	case XFS_BTNUM_BNO:
 	case XFS_BTNUM_CNT:
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
 		break;
 	case XFS_BTNUM_INO:
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
 		break;
 	case XFS_BTNUM_BMAP:
-		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
 		break;
 	default:
 		ASSERT(0);
@@ -970,7 +970,8 @@ xfs_btree_get_buf_block(
 	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
 				 mp->m_bsize, flags);
 
-	ASSERT(!xfs_buf_geterror(*bpp));
+	if (!*bpp)
+		return ENOMEM;
 
 	*block = XFS_BUF_TO_BLOCK(*bpp);
 	return 0;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c57836dc778..cf0ac056815 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,6 @@
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue;
 #define xb_to_km(flags) \
 	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
 
-#define xfs_buf_allocate(flags) \
-	kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
-#define xfs_buf_deallocate(bp) \
-	kmem_zone_free(xfs_buf_zone, (bp));
 
 static inline int
 xfs_buf_is_vmapped(
@@ -152,6 +147,7 @@ xfs_buf_stale(
 	struct xfs_buf	*bp)
 {
 	bp->b_flags |= XBF_STALE;
+	xfs_buf_delwri_dequeue(bp);
 	atomic_set(&(bp)->b_lru_ref, 0);
 	if (!list_empty(&bp->b_lru)) {
 		struct xfs_buftarg *btp = bp->b_target;
@@ -167,14 +163,19 @@ xfs_buf_stale(
 	ASSERT(atomic_read(&bp->b_hold) >= 1);
 }
 
-STATIC void
-_xfs_buf_initialize(
-	xfs_buf_t		*bp,
-	xfs_buftarg_t		*target,
+struct xfs_buf *
+xfs_buf_alloc(
+	struct xfs_buftarg	*target,
 	xfs_off_t		range_base,
 	size_t			range_length,
 	xfs_buf_flags_t		flags)
 {
+	struct xfs_buf		*bp;
+
+	bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
+	if (unlikely(!bp))
+		return NULL;
+
 	/*
 	 * We don't want certain flags to appear in b_flags.
 	 */
@@ -203,8 +204,9 @@ _xfs_buf_initialize(
 	init_waitqueue_head(&bp->b_waiters);
 
 	XFS_STATS_INC(xb_create);
-
 	trace_xfs_buf_init(bp, _RET_IP_);
+
+	return bp;
 }
 
 /*
@@ -277,7 +279,7 @@ xfs_buf_free(
 	} else if (bp->b_flags & _XBF_KMEM)
 		kmem_free(bp->b_addr);
 	_xfs_buf_free_pages(bp);
-	xfs_buf_deallocate(bp);
+	kmem_zone_free(xfs_buf_zone, bp);
 }
 
 /*
@@ -416,10 +418,7 @@ _xfs_buf_map_pages(
 /*
  *	Look up, and creates if absent, a lockable buffer for
  *	a given range of an inode.  The buffer is returned
- *	locked.	 If other overlapping buffers exist, they are
- *	released before the new buffer is created and locked,
- *	which may imply that this call will block until those buffers
- *	are unlocked.  No I/O is implied by this call.
+ *	locked.	No I/O is implied by this call.
  */
 xfs_buf_t *
 _xfs_buf_find(
@@ -481,8 +480,6 @@ _xfs_buf_find(
 
 	/* No match found */
 	if (new_bp) {
-		_xfs_buf_initialize(new_bp, btp, range_base,
-				range_length, flags);
 		rb_link_node(&new_bp->b_rbnode, parent, rbp);
 		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
 		/* the buffer keeps the perag reference until it is freed */
@@ -525,35 +522,51 @@ found:
 }
 
 /*
- *	Assembles a buffer covering the specified range.
- *	Storage in memory for all portions of the buffer will be allocated,
- *	although backing storage may not be.
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
  */
-xfs_buf_t *
+struct xfs_buf *
 xfs_buf_get(
 	xfs_buftarg_t		*target,/* target for buffer		*/
 	xfs_off_t		ioff,	/* starting offset of range	*/
 	size_t			isize,	/* length of range		*/
 	xfs_buf_flags_t		flags)
 {
-	xfs_buf_t		*bp, *new_bp;
+	struct xfs_buf		*bp;
+	struct xfs_buf		*new_bp;
 	int			error = 0;
 
-	new_bp = xfs_buf_allocate(flags);
+	bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
+	if (likely(bp))
+		goto found;
+
+	new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
+			       flags);
 	if (unlikely(!new_bp))
 		return NULL;
 
 	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
+	if (!bp) {
+		kmem_zone_free(xfs_buf_zone, new_bp);
+		return NULL;
+	}
+
 	if (bp == new_bp) {
 		error = xfs_buf_allocate_memory(bp, flags);
 		if (error)
 			goto no_buffer;
-	} else {
-		xfs_buf_deallocate(new_bp);
-		if (unlikely(bp == NULL))
-			return NULL;
-	}
+	} else
+		kmem_zone_free(xfs_buf_zone, new_bp);
 
+	/*
+	 * Now we have a workable buffer, fill in the block number so
+	 * that we can do IO on it.
+	 */
+	bp->b_bn = ioff;
+	bp->b_count_desired = bp->b_buffer_length;
+
+found:
 	if (!(bp->b_flags & XBF_MAPPED)) {
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
@@ -564,18 +577,10 @@ xfs_buf_get(
 	}
 
 	XFS_STATS_INC(xb_get);
-
-	/*
-	 * Always fill in the block number now, the mapped cases can do
-	 * their own overlay of this later.
-	 */
-	bp->b_bn = ioff;
-	bp->b_count_desired = bp->b_buffer_length;
-
 	trace_xfs_buf_get(bp, flags, _RET_IP_);
 	return bp;
 
- no_buffer:
+no_buffer:
 	if (flags & (XBF_LOCK | XBF_TRYLOCK))
 		xfs_buf_unlock(bp);
 	xfs_buf_rele(bp);
@@ -689,19 +694,6 @@ xfs_buf_read_uncached(
 	return bp;
 }
 
-xfs_buf_t *
-xfs_buf_get_empty(
-	size_t			len,
-	xfs_buftarg_t		*target)
-{
-	xfs_buf_t		*bp;
-
-	bp = xfs_buf_allocate(0);
-	if (bp)
-		_xfs_buf_initialize(bp, target, 0, len, 0);
-	return bp;
-}
-
 /*
  * Return a buffer allocated as an empty buffer and associated to external
  * memory via xfs_buf_associate_memory() back to it's empty state.
@@ -787,10 +779,9 @@ xfs_buf_get_uncached(
 	int			error, i;
 	xfs_buf_t		*bp;
 
-	bp = xfs_buf_allocate(0);
+	bp = xfs_buf_alloc(target, 0, len, 0);
 	if (unlikely(bp == NULL))
 		goto fail;
-	_xfs_buf_initialize(bp, target, 0, len, 0);
 
 	error = _xfs_buf_get_pages(bp, page_count, 0);
 	if (error)
@@ -818,7 +809,7 @@ xfs_buf_get_uncached(
 		__free_page(bp->b_pages[i]);
 	_xfs_buf_free_pages(bp);
  fail_free_buf:
-	xfs_buf_deallocate(bp);
+	kmem_zone_free(xfs_buf_zone, bp);
  fail:
 	return NULL;
 }
@@ -937,12 +928,6 @@ void
 xfs_buf_unlock(
 	struct xfs_buf		*bp)
 {
-	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
-		atomic_inc(&bp->b_hold);
-		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_delwri_queue(bp, 0);
-	}
-
 	XB_CLEAR_OWNER(bp);
 	up(&bp->b_sema);
 
@@ -1019,9 +1004,19 @@ xfs_buf_ioerror(
 	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }
 
+void
+xfs_buf_ioerror_alert(
+	struct xfs_buf		*bp,
+	const char		*func)
+{
+	xfs_alert(bp->b_target->bt_mount,
+"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
+		(__uint64_t)XFS_BUF_ADDR(bp), func,
+		bp->b_error, XFS_BUF_COUNT(bp));
+}
+
 int
 xfs_bwrite(
-	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
 	int			error;
@@ -1033,25 +1028,13 @@ xfs_bwrite(
 	xfs_bdstrat_cb(bp);
 
 	error = xfs_buf_iowait(bp);
-	if (error)
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	xfs_buf_relse(bp);
+	if (error) {
+		xfs_force_shutdown(bp->b_target->bt_mount,
+				   SHUTDOWN_META_IO_ERROR);
+	}
 	return error;
 }
 
-void
-xfs_bdwrite(
-	void			*mp,
-	struct xfs_buf		*bp)
-{
-	trace_xfs_buf_bdwrite(bp, _RET_IP_);
-
-	bp->b_flags &= ~XBF_READ;
-	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
-
-	xfs_buf_delwri_queue(bp, 1);
-}
-
 /*
  * Called when we want to stop a buffer from getting written or read.
  * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
@@ -1074,9 +1057,8 @@ xfs_bioerror(
 	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
 	 */
 	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_UNDONE(bp);
-	XFS_BUF_STALE(bp);
+	xfs_buf_stale(bp);
 
 	xfs_buf_ioend(bp, 0);
 
@@ -1103,9 +1085,8 @@ xfs_bioerror_relse(
 	 * change that interface.
 	 */
 	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_DONE(bp);
-	XFS_BUF_STALE(bp);
+	xfs_buf_stale(bp);
 	bp->b_iodone = NULL;
 	if (!(fl & XBF_ASYNC)) {
 		/*
@@ -1115,7 +1096,7 @@ xfs_bioerror_relse(
 		 * ASYNC buffers.
 		 */
 		xfs_buf_ioerror(bp, EIO);
-		XFS_BUF_FINISH_IOWAIT(bp);
+		complete(&bp->b_iowait);
 	} else {
 		xfs_buf_relse(bp);
 	}
@@ -1275,15 +1256,10 @@ xfs_buf_iorequest(
 {
 	trace_xfs_buf_iorequest(bp, _RET_IP_);
 
-	if (bp->b_flags & XBF_DELWRI) {
-		xfs_buf_delwri_queue(bp, 1);
-		return 0;
-	}
+	ASSERT(!(bp->b_flags & XBF_DELWRI));
 
-	if (bp->b_flags & XBF_WRITE) {
+	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
-	}
-
 	xfs_buf_hold(bp);
 
 	/* Set the count to 1 initially, this will stop an I/O
@@ -1481,9 +1457,13 @@ xfs_setsize_buftarg_flags(
 	btp->bt_smask = sectorsize - 1;
 
 	if (set_blocksize(btp->bt_bdev, sectorsize)) {
+		char name[BDEVNAME_SIZE];
+
+		bdevname(btp->bt_bdev, name);
+
 		xfs_warn(btp->bt_mount,
 			"Cannot set_blocksize to %u on device %s\n",
-			sectorsize, xfs_buf_target_name(btp));
+			sectorsize, name);
 		return EINVAL;
 	}
 
@@ -1514,12 +1494,12 @@ xfs_setsize_buftarg(
 }
 
 STATIC int
-xfs_alloc_delwrite_queue(
+xfs_alloc_delwri_queue(
 	xfs_buftarg_t		*btp,
 	const char		*fsname)
 {
-	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
-	spin_lock_init(&btp->bt_delwrite_lock);
+	INIT_LIST_HEAD(&btp->bt_delwri_queue);
+	spin_lock_init(&btp->bt_delwri_lock);
 	btp->bt_flags = 0;
 	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
 	if (IS_ERR(btp->bt_task))
@@ -1549,7 +1529,7 @@ xfs_alloc_buftarg(
 	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
-	if (xfs_alloc_delwrite_queue(btp, fsname))
+	if (xfs_alloc_delwri_queue(btp, fsname))
 		goto error;
 	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1565,56 +1545,48 @@ error:
 /*
  *	Delayed write buffer handling
  */
-STATIC void
+void
 xfs_buf_delwri_queue(
-	xfs_buf_t		*bp,
-	int			unlock)
+	xfs_buf_t		*bp)
 {
-	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
+	struct xfs_buftarg	*btp = bp->b_target;
 
 	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
 
-	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
+	ASSERT(!(bp->b_flags & XBF_READ));
 
-	spin_lock(dwlk);
-	/* If already in the queue, dequeue and place at tail */
+	spin_lock(&btp->bt_delwri_lock);
 	if (!list_empty(&bp->b_list)) {
+		/* if already in the queue, move it to the tail */
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		if (unlock)
-			atomic_dec(&bp->b_hold);
-		list_del(&bp->b_list);
-	}
-
-	if (list_empty(dwq)) {
+		list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
+	} else {
 		/* start xfsbufd as it is about to have something to do */
-		wake_up_process(bp->b_target->bt_task);
-	}
+		if (list_empty(&btp->bt_delwri_queue))
+			wake_up_process(bp->b_target->bt_task);
 
-	bp->b_flags |= _XBF_DELWRI_Q;
-	list_add_tail(&bp->b_list, dwq);
+		atomic_inc(&bp->b_hold);
+		bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
+		list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
+	}
 	bp->b_queuetime = jiffies;
-	spin_unlock(dwlk);
-
-	if (unlock)
-		xfs_buf_unlock(bp);
+	spin_unlock(&btp->bt_delwri_lock);
 }
 
 void
 xfs_buf_delwri_dequeue(
 	xfs_buf_t		*bp)
 {
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
 	int			dequeued = 0;
 
-	spin_lock(dwlk);
+	spin_lock(&bp->b_target->bt_delwri_lock);
 	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
 		list_del_init(&bp->b_list);
 		dequeued = 1;
 	}
 	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-	spin_unlock(dwlk);
+	spin_unlock(&bp->b_target->bt_delwri_lock);
 
 	if (dequeued)
 		xfs_buf_rele(bp);
@@ -1646,16 +1618,9 @@ xfs_buf_delwri_promote(
 	if (bp->b_queuetime < jiffies - age)
 		return;
 	bp->b_queuetime = jiffies - age;
-	spin_lock(&btp->bt_delwrite_lock);
-	list_move(&bp->b_list, &btp->bt_delwrite_queue);
-	spin_unlock(&btp->bt_delwrite_lock);
-}
-
-STATIC void
-xfs_buf_runall_queues(
-	struct workqueue_struct	*queue)
-{
-	flush_workqueue(queue);
+	spin_lock(&btp->bt_delwri_lock);
+	list_move(&bp->b_list, &btp->bt_delwri_queue);
+	spin_unlock(&btp->bt_delwri_lock);
 }
 
 /*
@@ -1669,15 +1634,13 @@ xfs_buf_delwri_split(
 	unsigned long	age)
 {
 	xfs_buf_t	*bp, *n;
-	struct list_head *dwq = &target->bt_delwrite_queue;
-	spinlock_t	*dwlk = &target->bt_delwrite_lock;
 	int		skipped = 0;
 	int		force;
 
 	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
 	INIT_LIST_HEAD(list);
-	spin_lock(dwlk);
-	list_for_each_entry_safe(bp, n, dwq, b_list) {
+	spin_lock(&target->bt_delwri_lock);
+	list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
 		ASSERT(bp->b_flags & XBF_DELWRI);
 
 		if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
@@ -1694,10 +1657,9 @@ xfs_buf_delwri_split(
 		} else
 			skipped++;
 	}
-	spin_unlock(dwlk);
 
+	spin_unlock(&target->bt_delwri_lock);
 	return skipped;
-
 }
 
 /*
@@ -1747,7 +1709,7 @@ xfsbufd(
 		}
 
 		/* sleep for a long time if there is nothing to do. */
-		if (list_empty(&target->bt_delwrite_queue))
+		if (list_empty(&target->bt_delwri_queue))
 			tout = MAX_SCHEDULE_TIMEOUT;
 		schedule_timeout_interruptible(tout);
 
@@ -1783,9 +1745,7 @@ xfs_flush_buftarg(
 	LIST_HEAD(wait_list);
 	struct blk_plug plug;
 
-	xfs_buf_runall_queues(xfsconvertd_workqueue);
-	xfs_buf_runall_queues(xfsdatad_workqueue);
-	xfs_buf_runall_queues(xfslogd_workqueue);
+	flush_workqueue(xfslogd_workqueue);
 
 	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
 	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
@@ -1866,11 +1826,3 @@ xfs_buf_terminate(void)
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
 }
-
-#ifdef CONFIG_KDB_MODULES
-struct list_head *
-xfs_get_buftarg_list(void)
-{
-	return &xfs_buftarg_list;
-}
-#endif
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 620972b8094..5bab046e859 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -105,8 +105,8 @@ typedef struct xfs_buftarg {
 
 	/* per device delwri queue */
 	struct task_struct	*bt_task;
-	struct list_head	bt_delwrite_queue;
-	spinlock_t		bt_delwrite_lock;
+	struct list_head	bt_delwri_queue;
+	spinlock_t		bt_delwri_lock;
 	unsigned long		bt_flags;
 
 	/* LRU control structures */
@@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
 extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
 
-extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t,
+			      xfs_buf_flags_t);
 extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
 extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
@@ -197,14 +198,14 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 	((bp)->b_sema.count <= 0)
 
 /* Buffer Read and Write Routines */
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
+extern int xfs_bwrite(struct xfs_buf *bp);
 
 extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
 
 extern void xfs_buf_ioend(xfs_buf_t *,	int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
+extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
 extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
@@ -221,38 +222,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
 
 /* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
-extern void xfs_buf_delwri_promote(xfs_buf_t *);
+extern void xfs_buf_delwri_queue(struct xfs_buf *);
+extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
+extern void xfs_buf_delwri_promote(struct xfs_buf *);
 
 /* Buffer Daemon Setup Routines */
 extern int xfs_buf_init(void);
 extern void xfs_buf_terminate(void);
 
-static inline const char *
-xfs_buf_target_name(struct xfs_buftarg *target)
-{
-	static char __b[BDEVNAME_SIZE];
-
-	return bdevname(target->bt_bdev, __b);
-}
-
-
 #define XFS_BUF_ZEROFLAGS(bp) \
 	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
 			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
 
 void xfs_buf_stale(struct xfs_buf *bp);
-#define XFS_BUF_STALE(bp)	xfs_buf_stale(bp);
 #define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
 #define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XBF_STALE)
-#define XFS_BUF_SUPER_STALE(bp)	do {				\
-					XFS_BUF_STALE(bp);	\
-					xfs_buf_delwri_dequeue(bp);	\
-					XFS_BUF_DONE(bp);	\
-				} while (0)
-
-#define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(bp)	xfs_buf_delwri_dequeue(bp)
+
 #define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
 
 #define XFS_BUF_DONE(bp)	((bp)->b_flags |= XBF_DONE)
@@ -280,23 +265,16 @@ void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_SIZE(bp)		((bp)->b_buffer_length)
 #define XFS_BUF_SET_SIZE(bp, cnt)	((bp)->b_buffer_length = (cnt))
 
-static inline void
-xfs_buf_set_ref(
-	struct xfs_buf	*bp,
-	int		lru_ref)
+static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
 {
 	atomic_set(&bp->b_lru_ref, lru_ref);
 }
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)	xfs_buf_set_ref(bp, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)		do { } while (0)
 
 static inline int xfs_buf_ispinned(struct xfs_buf *bp)
 {
 	return atomic_read(&bp->b_pin_count);
 }
 
-#define XFS_BUF_FINISH_IOWAIT(bp)	complete(&bp->b_iowait);
-
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
 	xfs_buf_unlock(bp);
@@ -313,14 +291,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
 
-#ifdef CONFIG_KDB_MODULES
-extern struct list_head *xfs_get_buftarg_list(void);
-#endif
-
 #define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
 
-#define xfs_binval(buftarg)		xfs_flush_buftarg(buftarg, 1)
-#define XFS_bflush(buftarg)		xfs_flush_buftarg(buftarg, 1)
-
 #endif	/* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cac2ecfa674..1a3513881bc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -629,7 +629,7 @@ xfs_buf_item_push(
  * the xfsbufd to get this buffer written. We have to unlock the buffer
  * to allow the xfsbufd to write it, too.
  */
-STATIC void
+STATIC bool
 xfs_buf_item_pushbuf(
 	struct xfs_log_item	*lip)
 {
@@ -643,6 +643,7 @@ xfs_buf_item_pushbuf(
 
 	xfs_buf_delwri_promote(bp);
 	xfs_buf_relse(bp);
+	return true;
 }
 
 STATIC void
@@ -966,7 +967,8 @@ xfs_buf_iodone_callbacks(
 	 * I/O errors, there's no point in giving this a retry.
 	 */
 	if (XFS_FORCED_SHUTDOWN(mp)) {
-		XFS_BUF_SUPER_STALE(bp);
+		xfs_buf_stale(bp);
+		XFS_BUF_DONE(bp);
 		trace_xfs_buf_item_iodone(bp, _RET_IP_);
 		goto do_callbacks;
 	}
@@ -974,9 +976,7 @@ xfs_buf_iodone_callbacks(
 	if (bp->b_target != lasttarg ||
 	    time_after(jiffies, (lasttime + 5*HZ))) {
 		lasttime = jiffies;
-		xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
-			xfs_buf_target_name(bp->b_target),
-		      (__uint64_t)XFS_BUF_ADDR(bp));
+		xfs_buf_ioerror_alert(bp, __func__);
 	}
 	lasttarg = bp->b_target;
 
@@ -992,7 +992,7 @@ xfs_buf_iodone_callbacks(
 		xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
 
 		if (!XFS_BUF_ISSTALE(bp)) {
-			XFS_BUF_DELAYWRITE(bp);
+			xfs_buf_delwri_queue(bp);
 			XFS_BUF_DONE(bp);
 		}
 		ASSERT(bp->b_iodone != NULL);
@@ -1005,9 +1005,8 @@ xfs_buf_iodone_callbacks(
 	 * If the write of the buffer was synchronous, we want to make
 	 * sure to return the error to the caller of xfs_bwrite().
 	 */
-	XFS_BUF_STALE(bp);
+	xfs_buf_stale(bp);
 	XFS_BUF_DONE(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
 
 	trace_xfs_buf_error_relse(bp, _RET_IP_);
 
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index ee9d5427fcd..77c74257c2a 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int(
 	 */
 	nmap = 1;
 	ASSERT(args->firstblock != NULL);
-	error = xfs_bmapi(tp, dp, *bno, count,
-			xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
-			XFS_BMAPI_CONTIG,
+	error = xfs_bmapi_write(tp, dp, *bno, count,
+			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
 			args->flist);
 	if (error)
@@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int(
 		for (b = *bno, mapi = 0; b < *bno + count; ) {
 			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
 			c = (int)(*bno + count - b);
-			error = xfs_bmapi(tp, dp, b, c,
-					xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
-					XFS_BMAPI_METADATA,
+			error = xfs_bmapi_write(tp, dp, b, c,
+					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
 					&mapp[mapi], &nmap, args->flist);
 			if (error)
@@ -1975,33 +1973,16 @@ xfs_da_do_buf(
 		/*
 		 * Optimize the one-block case.
 		 */
-		if (nfsb == 1) {
-			xfs_fsblock_t	fsb;
-
-			if ((error =
-			    xfs_bmapi_single(trans, dp, whichfork, &fsb,
-				    (xfs_fileoff_t)bno))) {
-				return error;
-			}
+		if (nfsb == 1)
 			mapp = &map;
-			if (fsb == NULLFSBLOCK) {
-				nmap = 0;
-			} else {
-				map.br_startblock = fsb;
-				map.br_startoff = (xfs_fileoff_t)bno;
-				map.br_blockcount = 1;
-				nmap = 1;
-			}
-		} else {
+		else
 			mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
-			nmap = nfsb;
-			if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
-					nfsb,
-					XFS_BMAPI_METADATA |
-						xfs_bmapi_aflag(whichfork),
-					NULL, 0, mapp, &nmap, NULL)))
-				goto exit0;
-		}
+
+		nmap = nfsb;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp,
+				       &nmap, xfs_bmapi_aflag(whichfork));
+		if (error)
+			goto exit0;
 	} else {
 		map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
 		map.br_startoff = (xfs_fileoff_t)bno;
@@ -2072,13 +2053,10 @@ xfs_da_do_buf(
 		if (!bp)
 			continue;
 		if (caller == 1) {
-			if (whichfork == XFS_ATTR_FORK) {
-				XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
-						XFS_ATTR_BTREE_REF);
-			} else {
-				XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
-						XFS_DIR_BTREE_REF);
-			}
+			if (whichfork == XFS_ATTR_FORK)
+				xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+			else
+				xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
 		}
 		if (bplist) {
 			bplist[nbplist++] = bp;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 9a84a85c03b..654dc6f05ba 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -425,8 +425,8 @@ xfs_swap_extents(
 	}
 
 
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
 	xfs_trans_log_inode(tp, ip,  ilf_fields);
 	xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -438,7 +438,7 @@ xfs_swap_extents(
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(tp);
 
-	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
+	error = xfs_trans_commit(tp, 0);
 
 	trace_xfs_swap_extent_after(ip, 0);
 	trace_xfs_swap_extent_after(tip, 1);
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ca2386d82cd..66e108f561a 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents(
 				 * we already have in the table.
 				 */
 				nmap = map_size - map_valid;
-				error = xfs_bmapi(NULL, dp,
-					map_off,
+				error = xfs_bmapi_read(dp, map_off,
 					xfs_dir2_byte_to_da(mp,
 						XFS_DIR2_LEAF_OFFSET) - map_off,
-					XFS_BMAPI_METADATA, NULL, 0,
-					&map[map_valid], &nmap, NULL);
+					&map[map_valid], &nmap, 0);
 				/*
 				 * Don't know if we should ignore this or
 				 * try to return an error.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 244e797dae3..8a24f0c6c86 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -38,7 +38,7 @@ xfs_trim_extents(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno,
 	xfs_fsblock_t		start,
-	xfs_fsblock_t		len,
+	xfs_fsblock_t		end,
 	xfs_fsblock_t		minlen,
 	__uint64_t		*blocks_trimmed)
 {
@@ -100,7 +100,7 @@ xfs_trim_extents(
 		 * down partially overlapping ranges for now.
 		 */
 		if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
-		    XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+		    XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
 			trace_xfs_discard_exclude(mp, agno, fbno, flen);
 			goto next_extent;
 		}
@@ -145,7 +145,7 @@ xfs_ioc_trim(
 	struct request_queue	*q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
 	unsigned int		granularity = q->limits.discard_granularity;
 	struct fstrim_range	range;
-	xfs_fsblock_t		start, len, minlen;
+	xfs_fsblock_t		start, end, minlen;
 	xfs_agnumber_t		start_agno, end_agno, agno;
 	__uint64_t		blocks_trimmed = 0;
 	int			error, last_error = 0;
@@ -165,19 +165,19 @@ xfs_ioc_trim(
 	 * matter as trimming blocks is an advisory interface.
 	 */
 	start = XFS_B_TO_FSBT(mp, range.start);
-	len = XFS_B_TO_FSBT(mp, range.len);
+	end = start + XFS_B_TO_FSBT(mp, range.len) - 1;
 	minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
 
-	start_agno = XFS_FSB_TO_AGNO(mp, start);
-	if (start_agno >= mp->m_sb.sb_agcount)
+	if (start >= mp->m_sb.sb_dblocks)
 		return -XFS_ERROR(EINVAL);
+	if (end > mp->m_sb.sb_dblocks - 1)
+		end = mp->m_sb.sb_dblocks - 1;
 
-	end_agno = XFS_FSB_TO_AGNO(mp, start + len);
-	if (end_agno >= mp->m_sb.sb_agcount)
-		end_agno = mp->m_sb.sb_agcount - 1;
+	start_agno = XFS_FSB_TO_AGNO(mp, start);
+	end_agno = XFS_FSB_TO_AGNO(mp, end);
 
 	for (agno = start_agno; agno <= end_agno; agno++) {
-		error = -xfs_trim_extents(mp, agno, start, len, minlen,
+		error = -xfs_trim_extents(mp, agno, start, end, minlen,
 					  &blocks_trimmed);
 		if (error)
 			last_error = error;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index db62959bed1..25d7280e9f6 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -377,16 +377,14 @@ xfs_qm_dqalloc(
 		return (ESRCH);
 	}
 
-	xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
 	nmaps = 1;
-	if ((error = xfs_bmapi(tp, quotip,
-			      offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
-			      XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
-			      &firstblock,
-			      XFS_QM_DQALLOC_SPACE_RES(mp),
-			      &map, &nmaps, &flist))) {
+	error = xfs_bmapi_write(tp, quotip, offset_fsb,
+				XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+				&firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
+				&map, &nmaps, &flist);
+	if (error)
 		goto error0;
-	}
 	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
 	ASSERT(nmaps == 1);
 	ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -402,8 +400,11 @@ xfs_qm_dqalloc(
 			       dqp->q_blkno,
 			       mp->m_quotainfo->qi_dqchunklen,
 			       0);
-	if (!bp || (error = xfs_buf_geterror(bp)))
+
+	error = xfs_buf_geterror(bp);
+	if (error)
 		goto error1;
+
 	/*
 	 * Make a chunk of dquots out of this buffer and log
 	 * the entire thing.
@@ -485,9 +486,8 @@ xfs_qm_dqtobp(
 	/*
 	 * Find the block map; no allocations yet
 	 */
-	error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
-			  XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
-			  NULL, 0, &map, &nmaps, NULL);
+	error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
+			       XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
 
 	xfs_iunlock(quotip, XFS_ILOCK_SHARED);
 	if (error)
@@ -605,7 +605,7 @@ xfs_qm_dqread(
 	dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
 
 	/* Mark the buf so that this will stay incore a little longer */
-	XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
+	xfs_buf_set_ref(bp, XFS_DQUOT_REF);
 
 	/*
 	 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
@@ -1242,9 +1242,11 @@ xfs_qm_dqflush(
 	}
 
 	if (flags & SYNC_WAIT)
-		error = xfs_bwrite(mp, bp);
+		error = xfs_bwrite(bp);
 	else
-		xfs_bdwrite(mp, bp);
+		xfs_buf_delwri_queue(bp);
+
+	xfs_buf_relse(bp);
 
 	trace_xfs_dqflush_done(dqp);
 
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 9e0e2fa3f2c..bb3f71d236d 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait(
  * search the buffer cache can be a time consuming thing, and AIL lock is a
  * spinlock.
  */
-STATIC void
+STATIC bool
 xfs_qm_dquot_logitem_pushbuf(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
 	struct xfs_dquot	*dqp = qlip->qli_dquot;
 	struct xfs_buf		*bp;
+	bool			ret = true;
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
@@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf(
 	if (completion_done(&dqp->q_flush) ||
 	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_dqunlock(dqp);
-		return;
+		return true;
 	}
 
 	bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
 			dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
 	xfs_dqunlock(dqp);
 	if (!bp)
-		return;
+		return true;
 	if (XFS_BUF_ISDELAYWRITE(bp))
 		xfs_buf_delwri_promote(bp);
+	if (xfs_buf_ispinned(bp))
+		ret = false;
 	xfs_buf_relse(bp);
+	return ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 75e5d322e48..da108977b21 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = 0;
+	xfs_lsn_t		lsn = 0;
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	if (xfs_ipincount(ip)) {
-		error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
-				XFS_LOG_SYNC, NULL);
-	}
+	if (xfs_ipincount(ip))
+		lsn = ip->i_itemp->ili_last_lsn;
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	return error;
+	if (!lsn)
+		return 0;
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7f7b42469ea..753ed9b5c70 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -124,6 +124,35 @@ xfs_iozero(
 	return (-status);
 }
 
+/*
+ * Fsync operations on directories are much simpler than on regular files,
+ * as there is no file data to flush, and thus also no need for explicit
+ * cache flush operations, and there are no non-transaction metadata updates
+ * on directories either.
+ */
+STATIC int
+xfs_dir_fsync(
+	struct file		*file,
+	loff_t			start,
+	loff_t			end,
+	int			datasync)
+{
+	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_lsn_t		lsn = 0;
+
+	trace_xfs_dir_fsync(ip);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_ipincount(ip))
+		lsn = ip->i_itemp->ili_last_lsn;
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!lsn)
+		return 0;
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+}
+
 STATIC int
 xfs_file_fsync(
 	struct file		*file,
@@ -137,6 +166,7 @@ xfs_file_fsync(
 	struct xfs_trans	*tp;
 	int			error = 0;
 	int			log_flushed = 0;
+	xfs_lsn_t		lsn = 0;
 
 	trace_xfs_file_fsync(ip);
 
@@ -149,10 +179,6 @@ xfs_file_fsync(
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	xfs_ioend_wait(ip);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
 	if (mp->m_flags & XFS_MOUNT_BARRIER) {
 		/*
 		 * If we have an RT and/or log subvolume we need to make sure
@@ -216,11 +242,11 @@ xfs_file_fsync(
 		 * transaction.	 So we play it safe and fire off the
 		 * transaction anyway.
 		 */
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		xfs_trans_set_sync(tp);
-		error = _xfs_trans_commit(tp, 0, &log_flushed);
+		error = xfs_trans_commit(tp, 0);
 
+		lsn = ip->i_itemp->ili_last_lsn;
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	} else {
 		/*
@@ -231,14 +257,14 @@ xfs_file_fsync(
 		 * disk yet, the inode will be still be pinned.  If it is,
 		 * force the log.
 		 */
-		if (xfs_ipincount(ip)) {
-			error = _xfs_log_force_lsn(mp,
-					ip->i_itemp->ili_last_lsn,
-					XFS_LOG_SYNC, &log_flushed);
-		}
+		if (xfs_ipincount(ip))
+			lsn = ip->i_itemp->ili_last_lsn;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	}
 
+	if (!error && lsn)
+		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+
 	/*
 	 * If we only have a single device, and the log force about was
 	 * a no-op we might have to flush the data device cache here.
@@ -317,7 +343,19 @@ xfs_file_aio_read(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (unlikely(ioflags & IO_ISDIRECT)) {
+	/*
+	 * Locking is a bit tricky here. If we take an exclusive lock
+	 * for direct IO, we effectively serialise all new concurrent
+	 * read IO to this file and block it behind IO that is currently in
+	 * progress because IO in progress holds the IO lock shared. We only
+	 * need to hold the lock exclusive to blow away the page cache, so
+	 * only take lock exclusively if the page cache needs invalidation.
+	 * This allows the normal direct IO case of no page cache pages to
+	 * proceeed concurrently without serialisation.
+	 */
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
 		if (inode->i_mapping->nrpages) {
@@ -330,8 +368,7 @@ xfs_file_aio_read(
 			}
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-	} else
-		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	}
 
 	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 
@@ -407,11 +444,13 @@ xfs_aio_write_isize_update(
  */
 STATIC void
 xfs_aio_write_newsize_update(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	xfs_fsize_t		new_size)
 {
-	if (ip->i_new_size) {
+	if (new_size == ip->i_new_size) {
 		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-		ip->i_new_size = 0;
+		if (new_size == ip->i_new_size)
+			ip->i_new_size = 0;
 		if (ip->i_d.di_size > ip->i_size)
 			ip->i_d.di_size = ip->i_size;
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
@@ -462,7 +501,7 @@ xfs_file_splice_write(
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 
 	xfs_aio_write_isize_update(inode, ppos, ret);
-	xfs_aio_write_newsize_update(ip);
+	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	return ret;
 }
@@ -500,11 +539,9 @@ xfs_zero_last_block(
 
 	last_fsb = XFS_B_TO_FSBT(mp, isize);
 	nimaps = 1;
-	error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
-			  &nimaps, NULL);
-	if (error) {
+	error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+	if (error)
 		return error;
-	}
 	ASSERT(nimaps > 0);
 	/*
 	 * If the block underlying isize is just a hole, then there
@@ -595,8 +632,8 @@ xfs_zero_eof(
 	while (start_zero_fsb <= end_zero_fsb) {
 		nimaps = 1;
 		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-		error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
-				  0, NULL, 0, &imap, &nimaps, NULL);
+		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
+					  &imap, &nimaps, 0);
 		if (error) {
 			ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
 			return error;
@@ -659,6 +696,7 @@ xfs_file_aio_write_checks(
 	struct file		*file,
 	loff_t			*pos,
 	size_t			*count,
+	xfs_fsize_t		*new_sizep,
 	int			*iolock)
 {
 	struct inode		*inode = file->f_mapping->host;
@@ -666,6 +704,9 @@ xfs_file_aio_write_checks(
 	xfs_fsize_t		new_size;
 	int			error = 0;
 
+	xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+	*new_sizep = 0;
+restart:
 	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
 	if (error) {
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
@@ -673,20 +714,41 @@ xfs_file_aio_write_checks(
 		return error;
 	}
 
-	new_size = *pos + *count;
-	if (new_size > ip->i_size)
-		ip->i_new_size = new_size;
-
 	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
 		file_update_time(file);
 
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
-	 * write.
+	 * write. There is no need to issue zeroing if another in-flght IO ends
+	 * at or before this one If zeronig is needed and we are currently
+	 * holding the iolock shared, we need to update it to exclusive which
+	 * involves dropping all locks and relocking to maintain correct locking
+	 * order. If we do this, restart the function to ensure all checks and
+	 * values are still valid.
 	 */
-	if (*pos > ip->i_size)
+	if ((ip->i_new_size && *pos > ip->i_new_size) ||
+	    (!ip->i_new_size && *pos > ip->i_size)) {
+		if (*iolock == XFS_IOLOCK_SHARED) {
+			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+			*iolock = XFS_IOLOCK_EXCL;
+			xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+			goto restart;
+		}
 		error = -xfs_zero_eof(ip, *pos, ip->i_size);
+	}
+
+	/*
+	 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
+	 * We have already zeroed space beyond EOF (if necessary).  Only update
+	 * ip->i_new_size if this IO ends beyond any other in-flight writes.
+	 */
+	new_size = *pos + *count;
+	if (new_size > ip->i_size) {
+		if (new_size > ip->i_new_size)
+			ip->i_new_size = new_size;
+		*new_sizep = new_size;
+	}
 
 	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	if (error)
@@ -721,7 +783,7 @@ xfs_file_aio_write_checks(
  * the dio layer.  To avoid the problem with aio, we also need to wait for
  * outstanding IOs to complete so that unwritten extent conversion is completed
  * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ * hitting it with a big hammer (i.e. inode_dio_wait()).
  *
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
@@ -733,6 +795,7 @@ xfs_file_dio_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
+	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -753,18 +816,35 @@ xfs_file_dio_aio_write(
 	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
 		unaligned_io = 1;
 
-	if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+	/*
+	 * We don't need to take an exclusive lock unless there page cache needs
+	 * to be invalidated or unaligned IO is being executed. We don't need to
+	 * consider the EOF extension case here because
+	 * xfs_file_aio_write_checks() will relock the inode as necessary for
+	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 */
+	if (unaligned_io || mapping->nrpages)
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
-	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+	xfs_rw_ilock(ip, *iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	/*
+	 * Recheck if there are cached pages that need invalidate after we got
+	 * the iolock to protect against other threads adding new pages while
+	 * we were waiting for the iolock.
+	 */
+	if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
+		xfs_rw_iunlock(ip, *iolock);
+		*iolock = XFS_IOLOCK_EXCL;
+		xfs_rw_ilock(ip, *iolock);
+	}
+
+	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
 		return ret;
 
 	if (mapping->nrpages) {
-		WARN_ON(*iolock != XFS_IOLOCK_EXCL);
 		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
 							FI_REMAPF_LOCKED);
 		if (ret)
@@ -776,7 +856,7 @@ xfs_file_dio_aio_write(
 	 * otherwise demote the lock if we had to flush cached pages
 	 */
 	if (unaligned_io)
-		xfs_ioend_wait(ip);
+		inode_dio_wait(inode);
 	else if (*iolock == XFS_IOLOCK_EXCL) {
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		*iolock = XFS_IOLOCK_SHARED;
@@ -798,6 +878,7 @@ xfs_file_buffered_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
+	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -809,9 +890,9 @@ xfs_file_buffered_aio_write(
 	size_t			count = ocount;
 
 	*iolock = XFS_IOLOCK_EXCL;
-	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+	xfs_rw_ilock(ip, *iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
 		return ret;
 
@@ -851,6 +932,7 @@ xfs_file_aio_write(
 	ssize_t			ret;
 	int			iolock;
 	size_t			ocount = 0;
+	xfs_fsize_t		new_size = 0;
 
 	XFS_STATS_INC(xs_write_calls);
 
@@ -870,10 +952,10 @@ xfs_file_aio_write(
 
 	if (unlikely(file->f_flags & O_DIRECT))
 		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+						ocount, &new_size, &iolock);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+						ocount, &new_size, &iolock);
 
 	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
@@ -894,7 +976,7 @@ xfs_file_aio_write(
 	}
 
 out_unlock:
-	xfs_aio_write_newsize_update(ip);
+	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
@@ -1087,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
-	.fsync		= xfs_file_fsync,
+	.fsync		= xfs_dir_fsync,
 };
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 3ff3d9e23de..5170306a100 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -682,7 +682,7 @@ xfs_filestream_new_ag(
 	ip = ap->ip;
 	mp = ip->i_mount;
 	cache = mp->m_filestream;
-	minlen = ap->alen;
+	minlen = ap->length;
 	*agp = NULLAGNUMBER;
 
 	/*
@@ -761,7 +761,7 @@ xfs_filestream_new_ag(
 	 */
 	ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
 	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
-	        (ap->low ? XFS_PICK_LOWSPACE : 0);
+	        (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
 
 	err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
 	if (err || *agp == NULLAGNUMBER)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9153d2c77ca..1c6fdeb702f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -194,6 +194,10 @@ xfs_growfs_data_private(
 		bp = xfs_buf_get(mp->m_ddev_targp,
 				 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
 				 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		agf = XFS_BUF_TO_AGF(bp);
 		memset(agf, 0, mp->m_sb.sb_sectsize);
 		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -216,16 +220,21 @@ xfs_growfs_data_private(
 		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
 		agf->agf_freeblks = cpu_to_be32(tmpsize);
 		agf->agf_longest = cpu_to_be32(tmpsize);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * AG inode header block
 		 */
 		bp = xfs_buf_get(mp->m_ddev_targp,
 				 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
 				 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		agi = XFS_BUF_TO_AGI(bp);
 		memset(agi, 0, mp->m_sb.sb_sectsize);
 		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -240,10 +249,11 @@ xfs_growfs_data_private(
 		agi->agi_dirino = cpu_to_be32(NULLAGINO);
 		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
 			agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * BNO btree root block
 		 */
@@ -251,6 +261,10 @@ xfs_growfs_data_private(
 				 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
 				 BTOBB(mp->m_sb.sb_blocksize),
 				 XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -262,10 +276,11 @@ xfs_growfs_data_private(
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * CNT btree root block
 		 */
@@ -273,6 +288,10 @@ xfs_growfs_data_private(
 				 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
 				 BTOBB(mp->m_sb.sb_blocksize),
 				 XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -285,10 +304,11 @@ xfs_growfs_data_private(
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
 		nfree += be32_to_cpu(arec->ar_blockcount);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * INO btree root block
 		 */
@@ -296,6 +316,10 @@ xfs_growfs_data_private(
 				 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
 				 BTOBB(mp->m_sb.sb_blocksize),
 				 XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
@@ -303,10 +327,10 @@ xfs_growfs_data_private(
 		block->bb_numrecs = 0;
 		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
 		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
 	}
 	xfs_trans_agblocks_delta(tp, nfree);
 	/*
@@ -396,9 +420,9 @@ xfs_growfs_data_private(
 		 * just issue a warning and continue.  The real work is
 		 * already done and committed.
 		 */
-		if (!(error = xfs_bwrite(mp, bp))) {
-			continue;
-		} else {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error) {
 			xfs_warn(mp,
 		"write error %d updating secondary superblock for ag %d",
 				error, agno);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 9f24ec28283..169380e6605 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -150,7 +150,7 @@ xfs_check_agi_freecount(
 /*
  * Initialise a new set of inodes.
  */
-STATIC void
+STATIC int
 xfs_ialloc_inode_init(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
@@ -202,8 +202,8 @@ xfs_ialloc_inode_init(
 		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					 mp->m_bsize * blks_per_cluster,
 					 XBF_LOCK);
-		ASSERT(!xfs_buf_geterror(fbuf));
-
+		if (!fbuf)
+			return ENOMEM;
 		/*
 		 * Initialize all inodes in this buffer and then log them.
 		 *
@@ -225,6 +225,7 @@ xfs_ialloc_inode_init(
 		}
 		xfs_trans_inode_alloc_buf(tp, fbuf);
 	}
+	return 0;
 }
 
 /*
@@ -369,9 +370,11 @@ xfs_ialloc_ag_alloc(
 	 * rather than a linear progression to prevent the next generation
 	 * number from being easily guessable.
 	 */
-	xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
-			      random32());
+	error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
+			args.len, random32());
 
+	if (error)
+		return error;
 	/*
 	 * Convert the results.
 	 */
@@ -1502,7 +1505,7 @@ xfs_read_agi(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
-	XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
+	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
 
 	xfs_check_agi_unlinked(agi);
 	return 0;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 7759812c1bb..0fa98b1c70e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -75,7 +75,6 @@ xfs_inode_alloc(
 		return NULL;
 	}
 
-	ASSERT(atomic_read(&ip->i_iocount) == 0);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
@@ -150,7 +149,6 @@ xfs_inode_free(
 	}
 
 	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_iocount) == 0);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0239a7c7c88..c0237c602f1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -190,12 +190,6 @@ xfs_imap_to_bp(
 	}
 
 	xfs_inobp_check(mp, bp);
-
-	/*
-	 * Mark the buffer as an inode buffer now that it looks good
-	 */
-	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
-
 	*bpp = bp;
 	return 0;
 }
@@ -1152,7 +1146,7 @@ xfs_ialloc(
 	/*
 	 * Log the new values stuffed into the inode.
 	 */
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, flags);
 
 	/* now that we have an i_mode we can setup inode ops and unlock */
@@ -1187,6 +1181,7 @@ xfs_isize_check(
 	xfs_fileoff_t		map_first;
 	int			nimaps;
 	xfs_bmbt_irec_t		imaps[2];
+	int			error;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return;
@@ -1203,13 +1198,12 @@ xfs_isize_check(
 	 * The filesystem could be shutting down, so bmapi may return
 	 * an error.
 	 */
-	if (xfs_bmapi(NULL, ip, map_first,
+	error = xfs_bmapi_read(ip, map_first,
 			 (XFS_B_TO_FSB(mp,
-				       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
-			  map_first),
-			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
-			 NULL))
-	    return;
+			       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
+			 imaps, &nimaps, XFS_BMAPI_ENTIRE);
+	if (error)
+		return;
 	ASSERT(nimaps == 1);
 	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
 }
@@ -1297,7 +1291,7 @@ xfs_itruncate_extents(
 		 */
 		error = xfs_bmap_finish(&tp, &free_list, &committed);
 		if (committed)
-			xfs_trans_ijoin(tp, ip);
+			xfs_trans_ijoin(tp, ip, 0);
 		if (error)
 			goto out_bmap_cancel;
 
@@ -1313,7 +1307,7 @@ xfs_itruncate_extents(
 		error = xfs_trans_commit(tp, 0);
 		tp = ntp;
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		if (error)
 			goto out;
@@ -1644,7 +1638,7 @@ xfs_iunlink_remove(
  * inodes that are in memory - they all must be marked stale and attached to
  * the cluster buffer.
  */
-STATIC void
+STATIC int
 xfs_ifree_cluster(
 	xfs_inode_t	*free_ip,
 	xfs_trans_t	*tp,
@@ -1690,6 +1684,8 @@ xfs_ifree_cluster(
 					mp->m_bsize * blks_per_cluster,
 					XBF_LOCK);
 
+		if (!bp)
+			return ENOMEM;
 		/*
 		 * Walk the inodes already attached to the buffer and mark them
 		 * stale. These will all have the flush locks held, so an
@@ -1799,6 +1795,7 @@ retry:
 	}
 
 	xfs_perag_put(pag);
+	return 0;
 }
 
 /*
@@ -1878,10 +1875,10 @@ xfs_ifree(
 	dip->di_mode = 0;
 
 	if (delete) {
-		xfs_ifree_cluster(ip, tp, first_ino);
+		error = xfs_ifree_cluster(ip, tp, first_ino);
 	}
 
-	return 0;
+	return error;
 }
 
 /*
@@ -2472,11 +2469,11 @@ cluster_corrupt_out:
 		 */
 		if (bp->b_iodone) {
 			XFS_BUF_UNDONE(bp);
-			XFS_BUF_STALE(bp);
+			xfs_buf_stale(bp);
 			xfs_buf_ioerror(bp, EIO);
 			xfs_buf_ioend(bp, 0);
 		} else {
-			XFS_BUF_STALE(bp);
+			xfs_buf_stale(bp);
 			xfs_buf_relse(bp);
 		}
 	}
@@ -2597,9 +2594,11 @@ xfs_iflush(
 		goto cluster_corrupt_out;
 
 	if (flags & SYNC_WAIT)
-		error = xfs_bwrite(mp, bp);
+		error = xfs_bwrite(bp);
 	else
-		xfs_bdwrite(mp, bp);
+		xfs_buf_delwri_queue(bp);
+
+	xfs_buf_relse(bp);
 	return error;
 
 corrupt_out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 2380a4bcbec..760140d1dd6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -257,7 +257,6 @@ typedef struct xfs_inode {
 
 	xfs_fsize_t		i_size;		/* in-memory size */
 	xfs_fsize_t		i_new_size;	/* size when write completes */
-	atomic_t		i_iocount;	/* outstanding I/O count */
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 588406dc6a3..b7cf21ba240 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -658,10 +658,8 @@ xfs_inode_item_unlock(
 
 	lock_flags = iip->ili_lock_flags;
 	iip->ili_lock_flags = 0;
-	if (lock_flags) {
+	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
-		IRELE(ip);
-	}
 }
 
 /*
@@ -708,13 +706,14 @@ xfs_inode_item_committed(
  * marked delayed write. If that's the case, we'll promote it and that will
  * allow the caller to write the buffer by triggering the xfsbufd to run.
  */
-STATIC void
+STATIC bool
 xfs_inode_item_pushbuf(
 	struct xfs_log_item	*lip)
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
 	struct xfs_buf		*bp;
+	bool			ret = true;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 
@@ -725,7 +724,7 @@ xfs_inode_item_pushbuf(
 	if (completion_done(&ip->i_flush) ||
 	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		return;
+		return true;
 	}
 
 	bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
@@ -733,10 +732,13 @@ xfs_inode_item_pushbuf(
 
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	if (!bp)
-		return;
+		return true;
 	if (XFS_BUF_ISDELAYWRITE(bp))
 		xfs_buf_delwri_promote(bp);
+	if (xfs_buf_ispinned(bp))
+		ret = false;
 	xfs_buf_relse(bp);
+	return ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f7ce7debe14..d99a9051890 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1069,7 +1069,7 @@ xfs_ioctl_setattr(
 		}
 	}
 
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 
 	/*
 	 * Change file ownership.  Must be the owner or privileged.
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 091d82b94c4..9afa282aa93 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -208,22 +208,20 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error1;
 
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 
-	bmapi_flag = XFS_BMAPI_WRITE;
+	bmapi_flag = 0;
 	if (offset < ip->i_size || extsz)
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
-	 * Issue the xfs_bmapi() call to allocate the blocks.
-	 *
 	 * From this point onwards we overwrite the imap pointer that the
 	 * caller gave to us.
 	 */
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
-	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-		&firstfsb, 0, imap, &nimaps, &free_list);
+	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
+				&firstfsb, 0, imap, &nimaps, &free_list);
 	if (error)
 		goto error0;
 
@@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate(
 	while (count_fsb > 0) {
 		imaps = nimaps;
 		firstblock = NULLFSBLOCK;
-		error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
-				  &firstblock, 0, imap, &imaps, NULL);
+		error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
+				       0);
 		if (error)
 			return error;
 		for (n = 0; n < imaps; n++) {
@@ -381,7 +379,6 @@ xfs_iomap_write_delay(
 	xfs_fileoff_t	last_fsb;
 	xfs_off_t	aligned_offset;
 	xfs_fileoff_t	ioalign;
-	xfs_fsblock_t	firstblock;
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
@@ -425,12 +422,8 @@ retry:
 	}
 
 	nimaps = XFS_WRITE_IMAPS;
-	firstblock = NULLFSBLOCK;
-	error = xfs_bmapi(NULL, ip, offset_fsb,
-			  (xfs_filblks_t)(last_fsb - offset_fsb),
-			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
-			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-			  &nimaps, NULL);
+	error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
+				imap, &nimaps, XFS_BMAPI_ENTIRE);
 	switch (error) {
 	case 0:
 	case ENOSPC:
@@ -535,7 +528,7 @@ xfs_iomap_write_allocate(
 				return XFS_ERROR(error);
 			}
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip);
+			xfs_trans_ijoin(tp, ip, 0);
 
 			xfs_bmap_init(&free_list, &first_block);
 
@@ -587,14 +580,12 @@ xfs_iomap_write_allocate(
 			}
 
 			/*
-			 * Go get the actual blocks.
-	 	 	 *
 			 * From this point onwards we overwrite the imap
 			 * pointer that the caller gave to us.
 			 */
-			error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
-					XFS_BMAPI_WRITE, &first_block, 1,
-					imap, &nimaps, &free_list);
+			error = xfs_bmapi_write(tp, ip, map_start_fsb,
+						count_fsb, 0, &first_block, 1,
+						imap, &nimaps, &free_list);
 			if (error)
 				goto trans_cancel;
 
@@ -701,15 +692,15 @@ xfs_iomap_write_unwritten(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		/*
 		 * Modify the unwritten extent state of the buffer.
 		 */
 		xfs_bmap_init(&free_list, &firstfsb);
 		nimaps = 1;
-		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
-				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
+		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+				  XFS_BMAPI_CONVERT, &firstfsb,
 				  1, &imap, &nimaps, &free_list);
 		if (error)
 			goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 673704fab74..9ba2a07b734 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -102,37 +102,38 @@ xfs_mark_inode_dirty(
 
 }
 
+
+int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		   void *fs_info)
+{
+	const struct xattr *xattr;
+	struct xfs_inode *ip = XFS_I(inode);
+	int error = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		error = xfs_attr_set(ip, xattr->name, xattr->value,
+				     xattr->value_len, ATTR_SECURE);
+		if (error < 0)
+			break;
+	}
+	return error;
+}
+
 /*
  * Hook in SELinux.  This is not quite correct yet, what we really need
  * here (as we do for default ACLs) is a mechanism by which creation of
  * these attrs can be journalled at inode creation time (along with the
  * inode, of course, such that log replay can't cause these to be lost).
  */
+
 STATIC int
 xfs_init_security(
 	struct inode	*inode,
 	struct inode	*dir,
 	const struct qstr *qstr)
 {
-	struct xfs_inode *ip = XFS_I(inode);
-	size_t		length;
-	void		*value;
-	unsigned char	*name;
-	int		error;
-
-	error = security_inode_init_security(inode, dir, qstr, (char **)&name,
-					     &value, &length);
-	if (error) {
-		if (error == -EOPNOTSUPP)
-			return 0;
-		return -error;
-	}
-
-	error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
-
-	kfree(name);
-	kfree(value);
-	return error;
+	return security_inode_init_security(inode, dir, qstr,
+					    &xfs_initxattrs, NULL);
 }
 
 static void
@@ -465,7 +466,7 @@ xfs_vn_getattr(
 	trace_xfs_getattr(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -XFS_ERROR(EIO);
 
 	stat->size = XFS_ISIZE(ip);
 	stat->dev = inode->i_sb->s_dev;
@@ -611,7 +612,7 @@ xfs_setattr_nonsize(
 		}
 	}
 
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 
 	/*
 	 * Change file ownership.  Must be the owner or privileged.
@@ -833,16 +834,16 @@ xfs_setattr_size(
 	 * care about here.
 	 */
 	if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
-		error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
-					XBF_ASYNC, FI_NONE);
+		error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+					FI_NONE);
 		if (error)
 			goto out_unlock;
 	}
 
 	/*
-	 * Wait for all I/O to complete.
+	 * Wait for all direct I/O to complete.
 	 */
-	xfs_ioend_wait(ip);
+	inode_dio_wait(inode);
 
 	error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
 				     xfs_get_blocks);
@@ -863,7 +864,7 @@ xfs_setattr_size(
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 
 	/*
 	 * Only change the c/mtime if we are changing the size or we are
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 1e8a45e74c3..828662f70d6 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -68,6 +68,8 @@
 #include <linux/ctype.h>
 #include <linux/writeback.h>
 #include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/list_sort.h>
 
 #include <asm/page.h>
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3a8d4f66d70..2758a6277c5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -880,8 +880,8 @@ xlog_iodone(xfs_buf_t *bp)
 	 */
 	if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
 			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
-		xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
-		XFS_BUF_STALE(bp);
+		xfs_buf_ioerror_alert(bp, __func__);
+		xfs_buf_stale(bp);
 		xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
 		/*
 		 * This flag will be propagated to the trans-committed
@@ -1047,7 +1047,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	xlog_get_iclog_buffer_size(mp, log);
 
 	error = ENOMEM;
-	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
+	bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0);
 	if (!bp)
 		goto out_free_log;
 	bp->b_iodone = xlog_iodone;
@@ -1247,7 +1247,7 @@ xlog_bdstrat(
 
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		xfs_buf_ioerror(bp, EIO);
-		XFS_BUF_STALE(bp);
+		xfs_buf_stale(bp);
 		xfs_buf_ioend(bp, 0);
 		/*
 		 * It would seem logical to return EIO here, but we rely on
@@ -1387,9 +1387,9 @@ xlog_sync(xlog_t		*log,
 	 */
 	XFS_BUF_WRITE(bp);
 
-	if ((error = xlog_bdstrat(bp))) {
-		xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
-				  XFS_BUF_ADDR(bp));
+	error = xlog_bdstrat(bp);
+	if (error) {
+		xfs_buf_ioerror_alert(bp, "xlog_sync");
 		return error;
 	}
 	if (split) {
@@ -1423,9 +1423,9 @@ xlog_sync(xlog_t		*log,
 		/* account for internal log which doesn't start at block #0 */
 		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
 		XFS_BUF_WRITE(bp);
-		if ((error = xlog_bdstrat(bp))) {
-			xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
-					  bp, XFS_BUF_ADDR(bp));
+		error = xlog_bdstrat(bp);
+		if (error) {
+			xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
 			return error;
 		}
 	}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a199dbcee7d..541a508adea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -183,8 +183,7 @@ xlog_bread_noalign(
 	xfsbdstrat(log->l_mp, bp);
 	error = xfs_buf_iowait(bp);
 	if (error)
-		xfs_ioerror_alert("xlog_bread", log->l_mp,
-				  bp, XFS_BUF_ADDR(bp));
+		xfs_buf_ioerror_alert(bp, __func__);
 	return error;
 }
 
@@ -268,9 +267,10 @@ xlog_bwrite(
 	xfs_buf_lock(bp);
 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
 
-	if ((error = xfs_bwrite(log->l_mp, bp)))
-		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
-				  bp, XFS_BUF_ADDR(bp));
+	error = xfs_bwrite(bp);
+	if (error)
+		xfs_buf_ioerror_alert(bp, __func__);
+	xfs_buf_relse(bp);
 	return error;
 }
 
@@ -361,9 +361,7 @@ xlog_recover_iodone(
 		 * We're not going to bother about retrying
 		 * this during recovery. One strike!
 		 */
-		xfs_ioerror_alert("xlog_recover_iodone",
-					bp->b_target->bt_mount, bp,
-					XFS_BUF_ADDR(bp));
+		xfs_buf_ioerror_alert(bp, __func__);
 		xfs_force_shutdown(bp->b_target->bt_mount,
 					SHUTDOWN_META_IO_ERROR);
 	}
@@ -2135,8 +2133,7 @@ xlog_recover_buffer_pass2(
 		return XFS_ERROR(ENOMEM);
 	error = bp->b_error;
 	if (error) {
-		xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
-				  bp, buf_f->blf_blkno);
+		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
 		xfs_buf_relse(bp);
 		return error;
 	}
@@ -2171,15 +2168,16 @@ xlog_recover_buffer_pass2(
 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
 	    (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
 			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
-		XFS_BUF_STALE(bp);
-		error = xfs_bwrite(mp, bp);
+		xfs_buf_stale(bp);
+		error = xfs_bwrite(bp);
 	} else {
 		ASSERT(bp->b_target->bt_mount == mp);
 		bp->b_iodone = xlog_recover_iodone;
-		xfs_bdwrite(mp, bp);
+		xfs_buf_delwri_queue(bp);
 	}
 
-	return (error);
+	xfs_buf_relse(bp);
+	return error;
 }
 
 STATIC int
@@ -2230,8 +2228,7 @@ xlog_recover_inode_pass2(
 	}
 	error = bp->b_error;
 	if (error) {
-		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
-				  bp, in_f->ilf_blkno);
+		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
 		xfs_buf_relse(bp);
 		goto error;
 	}
@@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2(
 write_inode_buffer:
 	ASSERT(bp->b_target->bt_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
-	xfs_bdwrite(mp, bp);
+	xfs_buf_delwri_queue(bp);
+	xfs_buf_relse(bp);
 error:
 	if (need_free)
 		kmem_free(in_f);
@@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2(
 			     XFS_FSB_TO_BB(mp, dq_f->qlf_len),
 			     0, &bp);
 	if (error) {
-		xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
-				  bp, dq_f->qlf_blkno);
+		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
 		return error;
 	}
 	ASSERT(bp);
@@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2(
 	ASSERT(dq_f->qlf_size == 2);
 	ASSERT(bp->b_target->bt_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
-	xfs_bdwrite(mp, bp);
+	xfs_buf_delwri_queue(bp);
+	xfs_buf_relse(bp);
 
 	return (0);
 }
@@ -3656,7 +3654,7 @@ xlog_do_recover(
 		return error;
 	}
 
-	XFS_bflush(log->l_mp->m_ddev_targp);
+	xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
 
 	/*
 	 * If IO errors happened during recovery, bail out.
@@ -3689,8 +3687,7 @@ xlog_do_recover(
 	xfsbdstrat(log->l_mp, bp);
 	error = xfs_buf_iowait(bp);
 	if (error) {
-		xfs_ioerror_alert("xlog_do_recover",
-				  log->l_mp, bp, XFS_BUF_ADDR(bp));
+		xfs_buf_ioerror_alert(bp, __func__);
 		ASSERT(0);
 		xfs_buf_relse(bp);
 		return error;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0081657ad98..d06afbc3540 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,9 +44,6 @@
 #include "xfs_trace.h"
 
 
-STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
-
-
 #ifdef HAVE_PERCPU_SB
 STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
 						int);
@@ -1484,7 +1481,7 @@ xfs_unmountfs(
 	 * state as much as possible.
 	 */
 	xfs_reclaim_inodes(mp, 0);
-	XFS_bflush(mp->m_ddev_targp);
+	xfs_flush_buftarg(mp->m_ddev_targp, 1);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
@@ -1496,11 +1493,6 @@ xfs_unmountfs(
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
 
-	xfs_binval(mp->m_ddev_targp);
-	if (mp->m_rtdev_targp) {
-		xfs_binval(mp->m_rtdev_targp);
-	}
-
 	/*
 	 * Unreserve any blocks we have so that when we unmount we don't account
 	 * the reserved free space as used. This is really only necessary for
@@ -1526,7 +1518,16 @@ xfs_unmountfs(
 		xfs_warn(mp, "Unable to update superblock counters. "
 				"Freespace may not be correct on next mount.");
 	xfs_unmountfs_writesb(mp);
-	xfs_unmountfs_wait(mp); 		/* wait for async bufs */
+
+	/*
+	 * Make sure all buffers have been flushed and completed before
+	 * unmounting the log.
+	 */
+	error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+	if (error)
+		xfs_warn(mp, "%d busy buffers during unmount.", error);
+	xfs_wait_buftarg(mp->m_ddev_targp);
+
 	xfs_log_unmount_write(mp);
 	xfs_log_unmount(mp);
 	xfs_uuid_unmount(mp);
@@ -1537,16 +1538,6 @@ xfs_unmountfs(
 	xfs_free_perag(mp);
 }
 
-STATIC void
-xfs_unmountfs_wait(xfs_mount_t *mp)
-{
-	if (mp->m_logdev_targp != mp->m_ddev_targp)
-		xfs_wait_buftarg(mp->m_logdev_targp);
-	if (mp->m_rtdev_targp)
-		xfs_wait_buftarg(mp->m_rtdev_targp);
-	xfs_wait_buftarg(mp->m_ddev_targp);
-}
-
 int
 xfs_fs_writable(xfs_mount_t *mp)
 {
@@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 
 		XFS_BUF_UNDONE(sbp);
 		XFS_BUF_UNREAD(sbp);
-		XFS_BUF_UNDELAYWRITE(sbp);
+		xfs_buf_delwri_dequeue(sbp);
 		XFS_BUF_WRITE(sbp);
 		XFS_BUF_UNASYNC(sbp);
 		ASSERT(sbp->b_target == mp->m_ddev_targp);
 		xfsbdstrat(mp, sbp);
 		error = xfs_buf_iowait(sbp);
 		if (error)
-			xfs_ioerror_alert("xfs_unmountfs_writesb",
-					  mp, sbp, XFS_BUF_ADDR(sbp));
+			xfs_buf_ioerror_alert(sbp, __func__);
 		xfs_buf_relse(sbp);
 	}
 	return error;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 9a0aa76facd..5cff443f6cd 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs(
 			break;
 
 		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
-		xfs_bdwrite(mp, bp);
+		xfs_buf_delwri_queue(bp);
+		xfs_buf_relse(bp);
 		/*
 		 * goto the next block.
 		 */
@@ -1346,11 +1347,8 @@ xfs_qm_dqiterate(
 		 * the inode is never added to the transaction.
 		 */
 		xfs_ilock(qip, XFS_ILOCK_SHARED);
-		error = xfs_bmapi(NULL, qip, lblkno,
-				  maxlblkcnt - lblkno,
-				  XFS_BMAPI_METADATA,
-				  NULL,
-				  0, map, &nmaps, NULL);
+		error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
+				       map, &nmaps, 0);
 		xfs_iunlock(qip, XFS_ILOCK_SHARED);
 		if (error)
 			break;
@@ -1683,7 +1681,7 @@ xfs_qm_quotacheck(
 	 * quotacheck'd stamp on the superblock. So, here we do a synchronous
 	 * flush.
 	 */
-	XFS_bflush(mp->m_ddev_targp);
+	xfs_flush_buftarg(mp->m_ddev_targp, 1);
 
 	/*
 	 * If one type of quotas is off, then it will lose its
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 609246f42e6..5cc3dde1bc9 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 
 	error = xfs_itruncate_data(&tp, ip, 0);
 	if (error) {
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index df78c297d1a..866de277079 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -170,12 +170,12 @@ xfs_rename(
 	 * we can rely on either trans_commit or trans_cancel to unlock
 	 * them.
 	 */
-	xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
 	if (new_parent)
-		xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
 	if (target_ip)
-		xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we are using project inheritance, we only allow renames
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 35561a511b5..87323f1ded6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -112,7 +112,7 @@ xfs_growfs_rt_alloc(
 		 * Lock the inode.
 		 */
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 		xfs_bmap_init(&flist, &firstblock);
 		/*
@@ -120,9 +120,9 @@ xfs_growfs_rt_alloc(
 		 */
 		nmap = 1;
 		cancelflags |= XFS_TRANS_ABORT;
-		error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
-			XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
-			resblks, &map, &nmap, &flist);
+		error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
+					XFS_BMAPI_METADATA, &firstblock,
+					resblks, &map, &nmap, &flist);
 		if (!error && nmap < 1)
 			error = XFS_ERROR(ENOSPC);
 		if (error)
@@ -155,7 +155,7 @@ xfs_growfs_rt_alloc(
 			 * Lock the bitmap inode.
 			 */
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 			/*
 			 * Get a buffer for the block.
 			 */
@@ -856,33 +856,23 @@ xfs_rtbuf_get(
 	xfs_buf_t	**bpp)		/* output: buffer for the block */
 {
 	xfs_buf_t	*bp;		/* block buffer, result */
-	xfs_daddr_t	d;		/* disk addr of block */
-	int		error;		/* error value */
-	xfs_fsblock_t	fsb;		/* fs block number for block */
 	xfs_inode_t	*ip;		/* bitmap or summary inode */
+	xfs_bmbt_irec_t	map;
+	int		nmap;
+	int		error;		/* error value */
 
 	ip = issum ? mp->m_rsumip : mp->m_rbmip;
-	/*
-	 * Map from the file offset (block) and inode number to the
-	 * file system block.
-	 */
-	error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
-	if (error) {
+
+	error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+	if (error)
 		return error;
-	}
-	ASSERT(fsb != NULLFSBLOCK);
-	/*
-	 * Convert to disk address for buffer cache.
-	 */
-	d = XFS_FSB_TO_DADDR(mp, fsb);
-	/*
-	 * Read the buffer.
-	 */
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+
+	ASSERT(map.br_startblock != NULLFSBLOCK);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
 				   mp->m_bsize, 0, &bp);
-	if (error) {
+	if (error)
 		return error;
-	}
 	ASSERT(!xfs_buf_geterror(bp));
 	*bpp = bp;
 	return 0;
@@ -1970,7 +1960,7 @@ xfs_growfs_rt(
 		 * Lock out other callers by grabbing the bitmap inode lock.
 		 */
 		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 		/*
 		 * Update the bitmap inode's size.
 		 */
@@ -1982,7 +1972,7 @@ xfs_growfs_rt(
 		 * Get the summary inode into the transaction.
 		 */
 		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
 		/*
 		 * Update the summary inode's size.
 		 */
@@ -2153,7 +2143,7 @@ xfs_rtfree_extent(
 	 * Synchronize by locking the bitmap inode.
 	 */
 	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 
 #if defined(__KERNEL__) && defined(DEBUG)
 	/*
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index c96a8a05ac0..597d044a09a 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -92,24 +92,6 @@ xfs_do_force_shutdown(
 }
 
 /*
- * Prints out an ALERT message about I/O error.
- */
-void
-xfs_ioerror_alert(
-	char			*func,
-	struct xfs_mount	*mp,
-	xfs_buf_t		*bp,
-	xfs_daddr_t		blkno)
-{
-	xfs_alert(mp,
-		 "I/O error occurred: meta-data dev %s block 0x%llx"
-		 "       (\"%s\") error %d buf count %zd",
-		xfs_buf_target_name(bp->b_target),
-		(__uint64_t)blkno, func,
-		bp->b_error, XFS_BUF_COUNT(bp));
-}
-
-/*
  * This isn't an absolute requirement, but it is
  * just a good idea to call xfs_read_buf instead of
  * directly doing a read_buf call. For one, we shouldn't
@@ -143,14 +125,13 @@ xfs_read_buf(
 	} else {
 		*bpp = NULL;
 		if (error) {
-			xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp, __func__);
 		} else {
 			error = XFS_ERROR(EIO);
 		}
 		if (bp) {
 			XFS_BUF_UNDONE(bp);
-			XFS_BUF_UNDELAYWRITE(bp);
-			XFS_BUF_STALE(bp);
+			xfs_buf_stale(bp);
 			/*
 			 * brelse clears B_ERROR and b_error
 			 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 11c41ec6ed7..bbdb9ad6a4b 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			xfs_daddr_t blkno, int len, uint flags,
 			struct xfs_buf **bpp);
-extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
-				xfs_buf_t *bp, xfs_daddr_t blkno);
 extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
 
 #endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2366c54cc4f..3eca58f51ae 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -796,8 +796,6 @@ xfs_fs_destroy_inode(
 	if (is_bad_inode(inode))
 		goto out_reclaim;
 
-	xfs_ioend_wait(ip);
-
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
 
 	/*
@@ -837,7 +835,6 @@ xfs_fs_inode_init_once(
 	inode_init_once(VFS_I(ip));
 
 	/* xfs inode */
-	atomic_set(&ip->i_iocount, 0);
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 	init_waitqueue_head(&ip->i_ipin_wait);
@@ -887,7 +884,7 @@ xfs_log_inode(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	return xfs_trans_commit(tp, 0);
 }
@@ -914,9 +911,8 @@ xfs_fs_write_inode(
 		 * of forcing it all the way to stable storage using a
 		 * synchronous transaction we let the log force inside the
 		 * ->sync_fs call do that for thus, which reduces the number
-		 * of synchronous log foces dramatically.
+		 * of synchronous log forces dramatically.
 		 */
-		xfs_ioend_wait(ip);
 		error = xfs_log_inode(ip);
 		if (error)
 			goto out;
@@ -1019,7 +1015,7 @@ xfs_fs_put_super(
 	 */
 	xfs_filestream_unmount(mp);
 
-	XFS_bflush(mp->m_ddev_targp);
+	xfs_flush_buftarg(mp->m_ddev_targp, 1);
 
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
@@ -1443,7 +1439,7 @@ xfs_fs_fill_super(
 	 */
 	xfs_filestream_unmount(mp);
 
-	XFS_bflush(mp->m_ddev_targp);
+	xfs_flush_buftarg(mp->m_ddev_targp, 1);
 
 	xfs_unmountfs(mp);
 	goto out_free_sb;
@@ -1652,24 +1648,13 @@ xfs_init_workqueues(void)
 	 */
 	xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
 	if (!xfs_syncd_wq)
-		goto out;
-
-	xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
-	if (!xfs_ail_wq)
-		goto out_destroy_syncd;
-
+		return -ENOMEM;
 	return 0;
-
-out_destroy_syncd:
-	destroy_workqueue(xfs_syncd_wq);
-out:
-	return -ENOMEM;
 }
 
 STATIC void
 xfs_destroy_workqueues(void)
 {
-	destroy_workqueue(xfs_ail_wq);
 	destroy_workqueue(xfs_syncd_wq);
 }
 
@@ -1681,7 +1666,6 @@ init_xfs_fs(void)
 	printk(KERN_INFO XFS_VERSION_STRING " with "
 			 XFS_BUILD_OPTIONS " enabled\n");
 
-	xfs_ioend_init();
 	xfs_dir_startup();
 
 	error = xfs_init_zones();
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 4604f90f86a..aa3dc1a4d53 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -227,21 +227,17 @@ xfs_sync_inode_data(
 	int			error = 0;
 
 	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		goto out_wait;
+		return 0;
 
 	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
 		if (flags & SYNC_TRYLOCK)
-			goto out_wait;
+			return 0;
 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	}
 
 	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
 				0 : XBF_ASYNC, FI_NONE);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- out_wait:
-	if (flags & SYNC_WAIT)
-		xfs_ioend_wait(ip);
 	return error;
 }
 
@@ -322,6 +318,7 @@ xfs_sync_fsdata(
 	struct xfs_mount	*mp)
 {
 	struct xfs_buf		*bp;
+	int			error;
 
 	/*
 	 * If the buffer is pinned then push on the log so we won't get stuck
@@ -334,8 +331,9 @@ xfs_sync_fsdata(
 	bp = xfs_getsb(mp, 0);
 	if (xfs_buf_ispinned(bp))
 		xfs_log_force(mp, 0);
-
-	return xfs_bwrite(mp, bp);
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	return error;
 }
 
 /*
@@ -379,7 +377,7 @@ xfs_quiesce_data(
 
 	/* flush data-only devices */
 	if (mp->m_rtdev_targp)
-		XFS_bflush(mp->m_rtdev_targp);
+		xfs_flush_buftarg(mp->m_rtdev_targp, 1);
 
 	return error ? error : error2;
 }
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 690fc7a7bd7..f1d2802b2f0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -30,6 +30,7 @@ struct xfs_buf_log_item;
 struct xfs_da_args;
 struct xfs_da_node_entry;
 struct xfs_dquot;
+struct xfs_log_item;
 struct xlog_ticket;
 struct log;
 struct xlog_recover;
@@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_iorequest);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
-DEFINE_BUF_EVENT(xfs_buf_bdwrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_trylock);
@@ -577,6 +577,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap);
 DEFINE_INODE_EVENT(xfs_file_ioctl);
 DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
 DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_dir_fsync);
 DEFINE_INODE_EVENT(xfs_file_fsync);
 DEFINE_INODE_EVENT(xfs_destroy_inode);
 DEFINE_INODE_EVENT(xfs_write_inode);
@@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
 
+DECLARE_EVENT_CLASS(xfs_log_item_class,
+	TP_PROTO(struct xfs_log_item *lip),
+	TP_ARGS(lip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(void *, lip)
+		__field(uint, type)
+		__field(uint, flags)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = lip->li_mountp->m_super->s_dev;
+		__entry->lip = lip;
+		__entry->type = lip->li_type;
+		__entry->flags = lip->li_flags;
+		__entry->lsn = lip->li_lsn;
+	),
+	TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lip,
+		  CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn),
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_LOG_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_log_item_class, name, \
+	TP_PROTO(struct xfs_log_item *lip), \
+	TP_ARGS(lip))
+DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
+
+
 DECLARE_EVENT_CLASS(xfs_file_class,
 	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
 	TP_ARGS(ip, count, offset, flags),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index efc147f0e9b..1f35b2feca9 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1790,9 +1790,7 @@ xfs_trans_commit_cil(
 }
 
 /*
- * xfs_trans_commit
- *
- * Commit the given transaction to the log a/synchronously.
+ * Commit the given transaction to the log.
  *
  * XFS disk error handling mechanism is not based on a typical
  * transaction abort mechanism. Logically after the filesystem
@@ -1804,10 +1802,9 @@ xfs_trans_commit_cil(
  * Do not reference the transaction structure after this call.
  */
 int
-_xfs_trans_commit(
+xfs_trans_commit(
 	struct xfs_trans	*tp,
-	uint			flags,
-	int			*log_flushed)
+	uint			flags)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	xfs_lsn_t		commit_lsn = -1;
@@ -1866,7 +1863,7 @@ _xfs_trans_commit(
 	if (sync) {
 		if (!error) {
 			error = _xfs_log_force_lsn(mp, commit_lsn,
-				      XFS_LOG_SYNC, log_flushed);
+				      XFS_LOG_SYNC, NULL);
 		}
 		XFS_STATS_INC(xs_trans_sync);
 	} else {
@@ -2021,6 +2018,6 @@ xfs_trans_roll(
 	if (error)
 		return error;
 
-	xfs_trans_ijoin(trans, dp);
+	xfs_trans_ijoin(trans, dp, 0);
 	return 0;
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 06a9759b635..603f3eb5204 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -350,7 +350,7 @@ typedef struct xfs_item_ops {
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
 	void (*iop_push)(xfs_log_item_t *);
-	void (*iop_pushbuf)(xfs_log_item_t *);
+	bool (*iop_pushbuf)(xfs_log_item_t *);
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
 } xfs_item_ops_t;
 
@@ -470,8 +470,7 @@ void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
-void		xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
-void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
+void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
 void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
 void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
 struct xfs_efi_log_item	*xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -487,10 +486,7 @@ void		xfs_trans_log_efd_extent(xfs_trans_t *,
 					 struct xfs_efd_log_item *,
 					 xfs_fsblock_t,
 					 xfs_extlen_t);
-int		_xfs_trans_commit(xfs_trans_t *,
-				  uint flags,
-				  int *);
-#define xfs_trans_commit(tp, flags)	_xfs_trans_commit(tp, flags, NULL)
+int		xfs_trans_commit(xfs_trans_t *, uint flags);
 void		xfs_trans_cancel(xfs_trans_t *, int);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c15aa29fa16..ed9252bcdac 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -26,10 +26,9 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
+#include "xfs_trace.h"
 #include "xfs_error.h"
 
-struct workqueue_struct	*xfs_ail_wq;	/* AIL workqueue */
-
 #ifdef DEBUG
 /*
  * Check that the list is sorted as it should be.
@@ -356,28 +355,34 @@ xfs_ail_delete(
 	xfs_trans_ail_cursor_clear(ailp, lip);
 }
 
-/*
- * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
- * to run at a later time if there is more work to do to complete the push.
- */
-STATIC void
-xfs_ail_worker(
-	struct work_struct	*work)
+static long
+xfsaild_push(
+	struct xfs_ail		*ailp)
 {
-	struct xfs_ail		*ailp = container_of(to_delayed_work(work),
-					struct xfs_ail, xa_work);
 	xfs_mount_t		*mp = ailp->xa_mount;
 	struct xfs_ail_cursor	cur;
 	xfs_log_item_t		*lip;
 	xfs_lsn_t		lsn;
 	xfs_lsn_t		target;
 	long			tout = 10;
-	int			flush_log = 0;
 	int			stuck = 0;
 	int			count = 0;
 	int			push_xfsbufd = 0;
 
+	/*
+	 * If last time we ran we encountered pinned items, force the log first
+	 * and wait for it before pushing again.
+	 */
 	spin_lock(&ailp->xa_lock);
+	if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush &&
+	    !list_empty(&ailp->xa_ail)) {
+		ailp->xa_log_flush = 0;
+		spin_unlock(&ailp->xa_lock);
+		XFS_STATS_INC(xs_push_ail_flush);
+		xfs_log_force(mp, XFS_LOG_SYNC);
+		spin_lock(&ailp->xa_lock);
+	}
+
 	target = ailp->xa_target;
 	lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
 	if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
@@ -421,26 +426,37 @@ xfs_ail_worker(
 		switch (lock_result) {
 		case XFS_ITEM_SUCCESS:
 			XFS_STATS_INC(xs_push_ail_success);
+			trace_xfs_ail_push(lip);
+
 			IOP_PUSH(lip);
 			ailp->xa_last_pushed_lsn = lsn;
 			break;
 
 		case XFS_ITEM_PUSHBUF:
 			XFS_STATS_INC(xs_push_ail_pushbuf);
-			IOP_PUSHBUF(lip);
-			ailp->xa_last_pushed_lsn = lsn;
+			trace_xfs_ail_pushbuf(lip);
+
+			if (!IOP_PUSHBUF(lip)) {
+				trace_xfs_ail_pushbuf_pinned(lip);
+				stuck++;
+				ailp->xa_log_flush++;
+			} else {
+				ailp->xa_last_pushed_lsn = lsn;
+			}
 			push_xfsbufd = 1;
 			break;
 
 		case XFS_ITEM_PINNED:
 			XFS_STATS_INC(xs_push_ail_pinned);
+			trace_xfs_ail_pinned(lip);
+
 			stuck++;
-			flush_log = 1;
+			ailp->xa_log_flush++;
 			break;
 
 		case XFS_ITEM_LOCKED:
 			XFS_STATS_INC(xs_push_ail_locked);
-			ailp->xa_last_pushed_lsn = lsn;
+			trace_xfs_ail_locked(lip);
 			stuck++;
 			break;
 
@@ -480,16 +496,6 @@ xfs_ail_worker(
 	xfs_trans_ail_cursor_done(ailp, &cur);
 	spin_unlock(&ailp->xa_lock);
 
-	if (flush_log) {
-		/*
-		 * If something we need to push out was pinned, then
-		 * push out the log so it will become unpinned and
-		 * move forward in the AIL.
-		 */
-		XFS_STATS_INC(xs_push_ail_flush);
-		xfs_log_force(mp, 0);
-	}
-
 	if (push_xfsbufd) {
 		/* we've got delayed write buffers to flush */
 		wake_up_process(mp->m_ddev_targp->bt_task);
@@ -500,20 +506,7 @@ out_done:
 	if (!count) {
 		/* We're past our target or empty, so idle */
 		ailp->xa_last_pushed_lsn = 0;
-
-		/*
-		 * We clear the XFS_AIL_PUSHING_BIT first before checking
-		 * whether the target has changed. If the target has changed,
-		 * this pushes the requeue race directly onto the result of the
-		 * atomic test/set bit, so we are guaranteed that either the
-		 * the pusher that changed the target or ourselves will requeue
-		 * the work (but not both).
-		 */
-		clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
-		smp_rmb();
-		if (XFS_LSN_CMP(ailp->xa_target, target) == 0 ||
-		    test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
-			return;
+		ailp->xa_log_flush = 0;
 
 		tout = 50;
 	} else if (XFS_LSN_CMP(lsn, target) >= 0) {
@@ -532,14 +525,39 @@ out_done:
 		 * were stuck.
 		 *
 		 * Backoff a bit more to allow some I/O to complete before
-		 * continuing from where we were.
+		 * restarting from the start of the AIL. This prevents us
+		 * from spinning on the same items, and if they are pinned will
+		 * all the restart to issue a log force to unpin the stuck
+		 * items.
 		 */
 		tout = 20;
+		ailp->xa_last_pushed_lsn = 0;
+	}
+
+	return tout;
+}
+
+static int
+xfsaild(
+	void		*data)
+{
+	struct xfs_ail	*ailp = data;
+	long		tout = 0;	/* milliseconds */
+
+	while (!kthread_should_stop()) {
+		if (tout && tout <= 20)
+			__set_current_state(TASK_KILLABLE);
+		else
+			__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(tout ?
+				 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+
+		try_to_freeze();
+
+		tout = xfsaild_push(ailp);
 	}
 
-	/* There is more to do, requeue us.  */
-	queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
-					msecs_to_jiffies(tout));
+	return 0;
 }
 
 /*
@@ -574,8 +592,9 @@ xfs_ail_push(
 	 */
 	smp_wmb();
 	xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
-	if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
-		queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
+	smp_wmb();
+
+	wake_up_process(ailp->xa_task);
 }
 
 /*
@@ -813,9 +832,18 @@ xfs_trans_ail_init(
 	INIT_LIST_HEAD(&ailp->xa_ail);
 	INIT_LIST_HEAD(&ailp->xa_cursors);
 	spin_lock_init(&ailp->xa_lock);
-	INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
+
+	ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+			ailp->xa_mount->m_fsname);
+	if (IS_ERR(ailp->xa_task))
+		goto out_free_ailp;
+
 	mp->m_ail = ailp;
 	return 0;
+
+out_free_ailp:
+	kmem_free(ailp);
+	return ENOMEM;
 }
 
 void
@@ -824,6 +852,6 @@ xfs_trans_ail_destroy(
 {
 	struct xfs_ail	*ailp = mp->m_ail;
 
-	cancel_delayed_work_sync(&ailp->xa_work);
+	kthread_stop(ailp->xa_task);
 	kmem_free(ailp);
 }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 137e2b9e294..475a4ded4f4 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -160,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
 	if (bp != NULL) {
 		ASSERT(xfs_buf_islocked(bp));
-		if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
-			XFS_BUF_SUPER_STALE(bp);
+		if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+			xfs_buf_stale(bp);
+			XFS_BUF_DONE(bp);
+		}
 
 		/*
 		 * If the buffer is stale then it was binval'ed
@@ -294,8 +296,7 @@ xfs_trans_read_buf(
 
 		if (bp->b_error) {
 			error = bp->b_error;
-			xfs_ioerror_alert("xfs_trans_read_buf", mp,
-					  bp, blkno);
+			xfs_buf_ioerror_alert(bp, __func__);
 			xfs_buf_relse(bp);
 			return error;
 		}
@@ -337,8 +338,7 @@ xfs_trans_read_buf(
 			xfsbdstrat(tp->t_mountp, bp);
 			error = xfs_buf_iowait(bp);
 			if (error) {
-				xfs_ioerror_alert("xfs_trans_read_buf", mp,
-						  bp, blkno);
+				xfs_buf_ioerror_alert(bp, __func__);
 				xfs_buf_relse(bp);
 				/*
 				 * We can gracefully recover from most read
@@ -387,9 +387,9 @@ xfs_trans_read_buf(
 	}
 	if (bp->b_error) {
 		error = bp->b_error;
-		XFS_BUF_SUPER_STALE(bp);
-		xfs_ioerror_alert("xfs_trans_read_buf", mp,
-				  bp, blkno);
+		xfs_buf_stale(bp);
+		XFS_BUF_DONE(bp);
+		xfs_buf_ioerror_alert(bp, __func__);
 		if (tp->t_flags & XFS_TRANS_DIRTY)
 			xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
 		xfs_buf_relse(bp);
@@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	 * inside the b_bdstrat callback so that this won't get written to
 	 * disk.
 	 */
-	XFS_BUF_DELAYWRITE(bp);
 	XFS_BUF_DONE(bp);
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bp->b_iodone = xfs_buf_iodone_callbacks;
 	bip->bli_item.li_cb = xfs_buf_iodone;
 
+	xfs_buf_delwri_queue(bp);
+
 	trace_xfs_trans_log_buf(bip);
 
 	/*
@@ -738,8 +739,7 @@ xfs_trans_binval(
 	 * We set the stale bit in the buffer as well since we're getting
 	 * rid of it.
 	 */
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_STALE(bp);
+	xfs_buf_stale(bp);
 	bip->bli_flags |= XFS_BLI_STALE;
 	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
 	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index c8dea2fd7e6..32f0288ae10 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug(
  * Add a locked inode to the transaction.
  *
  * The inode must be locked, and it cannot be associated with any transaction.
+ * If lock_flags is non-zero the inode will be unlocked on transaction commit.
  */
 void
 xfs_trans_ijoin(
 	struct xfs_trans	*tp,
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	uint			lock_flags)
 {
 	xfs_inode_log_item_t	*iip;
 
@@ -59,7 +61,9 @@ xfs_trans_ijoin(
 	if (ip->i_itemp == NULL)
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
+
 	ASSERT(iip->ili_lock_flags == 0);
+	iip->ili_lock_flags = lock_flags;
 
 	/*
 	 * Get a log_item_desc to point at the new item.
@@ -70,25 +74,6 @@ xfs_trans_ijoin(
 }
 
 /*
- * Add a locked inode to the transaction.
- *
- *
- * Grabs a reference to the inode which will be dropped when the transaction
- * is committed.  The inode will also be unlocked at that point.  The inode
- * must be locked, and it cannot be associated with any transaction.
- */
-void
-xfs_trans_ijoin_ref(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	uint			lock_flags)
-{
-	xfs_trans_ijoin(tp, ip);
-	IHOLD(ip);
-	ip->i_itemp->ili_lock_flags = lock_flags;
-}
-
-/*
  * Transactional inode timestamp update. Requires the inode to be locked and
  * joined to the transaction supplied. Relies on the transaction subsystem to
  * track dirty state and update/writeback the inode accordingly.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 212946b9723..44820b9fcb4 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -64,23 +64,18 @@ struct xfs_ail_cursor {
  */
 struct xfs_ail {
 	struct xfs_mount	*xa_mount;
+	struct task_struct	*xa_task;
 	struct list_head	xa_ail;
 	xfs_lsn_t		xa_target;
 	struct list_head	xa_cursors;
 	spinlock_t		xa_lock;
-	struct delayed_work	xa_work;
 	xfs_lsn_t		xa_last_pushed_lsn;
-	unsigned long		xa_flags;
+	int			xa_log_flush;
 };
 
-#define XFS_AIL_PUSHING_BIT	0
-
 /*
  * From xfs_trans_ail.c
  */
-
-extern struct workqueue_struct	*xfs_ail_wq;	/* AIL workqueue */
-
 void	xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
 				struct xfs_ail_cursor *cur,
 				struct xfs_log_item **log_items, int nr_items,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 51fc429527b..4ecf2a54906 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -72,8 +72,8 @@ xfs_readlink_bmap(
 	xfs_buf_t	*bp;
 	int		error = 0;
 
-	error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
-			mval, &nmaps, NULL);
+	error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
+			       0);
 	if (error)
 		goto out;
 
@@ -87,8 +87,7 @@ xfs_readlink_bmap(
 			return XFS_ERROR(ENOMEM);
 		error = bp->b_error;
 		if (error) {
-			xfs_ioerror_alert("xfs_readlink",
-				  ip->i_mount, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp, __func__);
 			xfs_buf_relse(bp);
 			goto out;
 		}
@@ -178,8 +177,7 @@ xfs_free_eofblocks(
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-			  NULL, 0, &imap, &nimaps, NULL);
+	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
@@ -220,7 +218,7 @@ xfs_free_eofblocks(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		error = xfs_itruncate_data(&tp, ip, ip->i_size);
 		if (error) {
@@ -289,7 +287,7 @@ xfs_inactive_symlink_rmt(
 	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	size = (int)ip->i_d.di_size;
 	ip->i_d.di_size = 0;
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Find the block(s) so we can inval and unmap them.
@@ -297,9 +295,9 @@ xfs_inactive_symlink_rmt(
 	done = 0;
 	xfs_bmap_init(&free_list, &first_block);
 	nmaps = ARRAY_SIZE(mval);
-	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
-			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-			&free_list)))
+	error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
+				mval, &nmaps, 0);
+	if (error)
 		goto error0;
 	/*
 	 * Invalidate the block(s).
@@ -308,6 +306,10 @@ xfs_inactive_symlink_rmt(
 		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
 			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
 			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+		if (!bp) {
+			error = ENOMEM;
+			goto error1;
+		}
 		xfs_trans_binval(tp, bp);
 	}
 	/*
@@ -333,7 +335,7 @@ xfs_inactive_symlink_rmt(
 	 * Mark it dirty so it will be logged and moved forward in the log as
 	 * part of every commit.
 	 */
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Get a new, empty transaction to return to our caller.
@@ -466,7 +468,7 @@ xfs_inactive_attrs(
 		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
 	ASSERT(ip->i_d.di_anextents == 0);
@@ -647,8 +649,6 @@ xfs_inactive(
 	if (truncate) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-		xfs_ioend_wait(ip);
-
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_ITRUNCATE_LOG_RES(mp),
 					  0, XFS_TRANS_PERM_LOG_RES,
@@ -662,7 +662,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		error = xfs_itruncate_data(&tp, ip, 0);
 		if (error) {
@@ -686,7 +686,7 @@ xfs_inactive(
 			return VN_INACTIVE_CACHE;
 		}
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 	} else {
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_IFREE_LOG_RES(mp),
@@ -699,7 +699,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 	}
 
 	/*
@@ -939,7 +939,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1260,8 +1260,8 @@ xfs_remove(
 
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're removing a directory perform some additional validation.
@@ -1406,8 +1406,8 @@ xfs_link(
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If the source has too many links, we can't make any more to it.
@@ -1601,7 +1601,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	/*
@@ -1632,10 +1632,9 @@ xfs_symlink(
 		first_fsb = 0;
 		nmaps = SYMLINK_MAPS;
 
-		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
-				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
-				  &first_block, resblks, mval, &nmaps,
-				  &free_list);
+		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+				  XFS_BMAPI_METADATA, &first_block, resblks,
+				  mval, &nmaps, &free_list);
 		if (error)
 			goto error2;
 
@@ -1650,7 +1649,10 @@ xfs_symlink(
 			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					       BTOBB(byte_cnt), 0);
-			ASSERT(!xfs_buf_geterror(bp));
+			if (!bp) {
+				error = ENOMEM;
+				goto error2;
+			}
 			if (pathlen < byte_cnt) {
 				byte_cnt = pathlen;
 			}
@@ -1732,7 +1734,7 @@ xfs_set_dmattrs(
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	ip->i_d.di_dmevmask = evmask;
 	ip->i_d.di_dmstate  = state;
@@ -1778,7 +1780,6 @@ xfs_alloc_file_space(
 	xfs_fileoff_t		startoffset_fsb;
 	xfs_fsblock_t		firstfsb;
 	int			nimaps;
-	int			bmapi_flag;
 	int			quota_flag;
 	int			rt;
 	xfs_trans_t		*tp;
@@ -1806,7 +1807,6 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -1877,16 +1877,12 @@ xfs_alloc_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
-		/*
-		 * Issue the xfs_bmapi() call to allocate the blocks
-		 */
 		xfs_bmap_init(&free_list, &firstfsb);
-		error = xfs_bmapi(tp, ip, startoffset_fsb,
-				  allocatesize_fsb, bmapi_flag,
-				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list);
+		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+					allocatesize_fsb, alloc_type, &firstfsb,
+					0, imapp, &nimaps, &free_list);
 		if (error) {
 			goto error0;
 		}
@@ -1976,8 +1972,7 @@ xfs_zero_remaining_bytes(
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-			NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -1997,8 +1992,8 @@ xfs_zero_remaining_bytes(
 		xfsbdstrat(mp, bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
-			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
-					  mp, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp,
+					"xfs_zero_remaining_bytes(read)");
 			break;
 		}
 		memset(bp->b_addr +
@@ -2010,8 +2005,8 @@ xfs_zero_remaining_bytes(
 		xfsbdstrat(mp, bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
-			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
-					  mp, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp,
+					"xfs_zero_remaining_bytes(write)");
 			break;
 		}
 	}
@@ -2076,7 +2071,7 @@ xfs_free_file_space(
 	if (need_iolock) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 		/* wait for the completion of any pending DIOs */
-		xfs_ioend_wait(ip);
+		inode_dio_wait(VFS_I(ip));
 	}
 
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -2096,8 +2091,8 @@ xfs_free_file_space(
 	 */
 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, startoffset_fsb,
-			1, 0, NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+					&imap, &nimap, 0);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2111,8 +2106,8 @@ xfs_free_file_space(
 				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
 		}
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-			1, 0, NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+					&imap, &nimap, 0);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2180,7 +2175,7 @@ xfs_free_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		/*
 		 * issue the bunmapi() call to free the blocks
@@ -2353,8 +2348,7 @@ xfs_change_file_space(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;
@@ -2379,10 +2373,5 @@ xfs_change_file_space(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	if (attr_flags & XFS_ATTR_SYNC)
 		xfs_trans_set_sync(tp);
-
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	return error;
+	return xfs_trans_commit(tp, 0);
 }