156 files changed, 4940 insertions, 2519 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0d..32ef4009d03 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfce..d61e3b28ce3 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a..25b300e1c9d 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -296,3 +296,14 @@ const struct file_operations v9fs_file_operations = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync,
 };
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 6d4d86187c5..4331b3b5ee1 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,9 +44,12 @@
 #include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -273,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_proto_dotu(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_proto_dotu(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -432,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
 	int retval;
 	struct inode *file_inode;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
 		rmdir);
 
 	file_inode = file->d_inode;
-	v9ses = v9fs_inode2v9ses(file_inode);
 	v9fid = v9fs_fid_clone(file);
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
@@ -482,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = NULL;
 	fid = NULL;
 	name = (char *) dentry->d_name.name;
-	dfid = v9fs_fid_clone(dentry->d_parent);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
-		dfid = NULL;
-		goto error;
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return ERR_PTR(err);
 	}
 
 	/* clone a fid to use for creation */
@@ -495,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		ofid = NULL;
-		goto error;
+		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -506,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 0);
+	fid = p9_client_walk(dfid, 1, &name, 1);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
-	} else
-		dfid = NULL;
+	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -536,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	return ofid;
 
 error:
-	if (dfid)
-		p9_client_clunk(dfid);
-
 	if (ofid)
 		p9_client_clunk(ofid);
 
@@ -673,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(fid)) {
 		result = PTR_ERR(fid);
 		if (result == -ENOENT) {
-			d_add(dentry, NULL);
-			return NULL;
+			inode = NULL;
+			goto inst_out;
 		}
 
 		return ERR_PTR(result);
@@ -691,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (result < 0)
 		goto error;
 
-	if ((fid->qid.version) && (v9ses->cache))
+inst_out:
+	if (v9ses->cache)
 		dentry->d_op = &v9fs_cached_dentry_operations;
 	else
 		dentry->d_op = &v9fs_dentry_operations;
@@ -770,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto clunk_olddir;
 	}
 
+	if (v9fs_proto_dotl(v9ses)) {
+		retval = p9_client_rename(oldfid, newdirfid,
+					(char *) new_dentry->d_name.name);
+		if (retval != -ENOSYS)
+			goto clunk_newdir;
+	}
+
 	/* 9P can only handle file rename in the same directory */
 	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1195,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISFIFO(mode))
 		*name = 0;
+	else if (S_ISSOCK(mode))
+		*name = 0;
 	else {
 		__putname(name);
 		return -EINVAL;
@@ -1206,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-static const struct inode_operations v9fs_dir_inode_operations_ext = {
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
 	.symlink = v9fs_vfs_symlink,
@@ -1237,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
+static const struct inode_operations v9fs_file_inode_operations_dotl = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -1244,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
+
+static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a..be74d020436 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,6 +38,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/statfs.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -45,7 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static const struct super_operations v9fs_super_ops;
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
 /**
  * v9fs_set_super - set the superblock
@@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
-	sb->s_op = &v9fs_super_ops;
+	if (v9fs_proto_dotl(v9ses))
+		sb->s_op = &v9fs_super_ops_dotl;
+	else
+		sb->s_op = &v9fs_super_ops;
 	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb)
 	v9fs_session_begin_cancel(v9ses);
 }
 
+/* Fill @buf from p9_client_statfs() on dotl sessions, else simple_statfs(). */
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct p9_fid *fid = v9fs_fid_lookup(dentry);
+	struct p9_rstatfs rs;
+	int res;
+
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	/* Only dotl sessions call p9_client_statfs(). */
+	if (!v9fs_proto_dotl(v9fs_inode2v9ses(dentry->d_inode)))
+		return simple_statfs(dentry, buf);
+
+	/* -ENOSYS falls back to simple_statfs(); other errors are returned. */
+	res = p9_client_statfs(fid, &rs);
+	if (res == -ENOSYS)
+		return simple_statfs(dentry, buf);
+	if (res != 0)
+		return res;
+
+	buf->f_type = rs.type;
+	buf->f_bsize = rs.bsize;
+	buf->f_blocks = rs.blocks;
+	buf->f_bfree = rs.bfree;
+	buf->f_bavail = rs.bavail;
+	buf->f_files = rs.files;
+	buf->f_ffree = rs.ffree;
+	buf->f_namelen = rs.namelen;
+	/* Split the fsid into two 32-bit words, low word first. */
+	buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+
+	return 0;
+}
+
 static const struct super_operations v9fs_super_ops = {
 #ifdef CONFIG_9P_FSCACHE
 	.alloc_inode = v9fs_alloc_inode,
@@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
 	.umount_begin = v9fs_umount_begin,
 };
 
+static const struct super_operations v9fs_super_ops_dotl = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
+	.statfs = v9fs_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+};
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
 	.get_sb = v9fs_get_sb,
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7..914d1c0bc07 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
 		if (IS_ERR(inode))
-			return ERR_PTR(PTR_ERR(inode));
+			return ERR_CAST(inode);
 	}
 	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
 	d_add(dentry, inode);
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83..48fdeebdb54 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
 #include <linux/hash.h>
+#include <linux/compat.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
 	return ret;
 }
 
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
 {
 	ssize_t ret;
 
-	ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
-				    kiocb->ki_nbytes, 1,
-				    &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+#ifdef CONFIG_COMPAT
+	if (compat)
+		ret = compat_rw_copy_check_uvector(type,
+				(struct compat_iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
+	else
+#endif
+		ret = rw_copy_check_uvector(type,
+				(struct iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
 	if (ret < 0)
 		goto out;
 
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
  *	Performs the initial checks and aio retry method
  *	setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 {
 	struct file *file = kiocb->ki_filp;
 	ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_READ);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(READ, kiocb);
+		ret = aio_setup_vectored_rw(READ, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_WRITE);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(WRITE, kiocb);
+		ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash)
+			 struct iocb *iocb, struct hlist_head *batch_hash,
+			 bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
 	req->ki_opcode = iocb->aio_lio_opcode;
 
-	ret = aio_setup_iocb(req);
+	ret = aio_setup_iocb(req, compat);
 
 	if (ret)
 		goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
 	return ret;
 }
 
-/* sys_io_submit:
- *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
- *	the number of iocbs queued.  May return -EINVAL if the aio_context
- *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
- *	*iocbpp[0] is not properly initialized, if the operation specified
- *	is invalid for the file descriptor in the iocb.  May fail with
- *	-EFAULT if any of the data structures point to invalid data.  May
- *	fail with -EBADF if the file descriptor specified in the first
- *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
- *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
- *	fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-		struct iocb __user * __user *, iocbpp)
+long do_io_submit(aio_context_t ctx_id, long nr,
+		  struct iocb __user *__user *iocbpp, bool compat)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
 		if (ret)
 			break;
 	}
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 	return i ? i : ret;
 }
 
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ *	*iocbpp[0] is not properly initialized, if the operation specified
+ *	is invalid for the file descriptor in the iocb.  May fail with
+ *	-EFAULT if any of the data structures point to invalid data.  May
+ *	fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
+{
+	return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
 /* lookup_kiocb
  *	Finds a given iocb for cancellation.
  */
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc7..9a0520b5066 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
 
 const struct file_operations autofs_root_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= autofs_root_readdir,
 	.ioctl		= autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df86..ba4a38b9c22 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  */
 static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
-	struct autofs_dev_ioctl tmp, *ads;
+	struct autofs_dev_ioctl tmp;
 
 	if (copy_from_user(&tmp, in, sizeof(tmp)))
 		return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
 	if (tmp.size < sizeof(tmp))
 		return ERR_PTR(-EINVAL);
 
-	ads = kmalloc(tmp.size, GFP_KERNEL);
-	if (!ads)
-		return ERR_PTR(-ENOMEM);
-
-	if (copy_from_user(ads, in, tmp.size)) {
-		kfree(ads);
-		return ERR_PTR(-EFAULT);
-	}
-
-	return ads;
+	return memdup_user(in, tmp.size);
 }
 
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
-	.minor 		= MISC_DYNAMIC_MINOR,
+	.minor		= AUTOFS_MINOR,
 	.name  		= AUTOFS_DEVICE_NAME,
 	.fops  		= &_dev_ioctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
+MODULE_ALIAS("devname:autofs");
+
 /* Register/deregister misc character device */
 int autofs_dev_ioctl_init(void)
 {
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac95..db4117ed780 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,14 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
+#include <linux/smp_lock.h>
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
-static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
+static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
 	.read		= generic_read_dir,
 	.readdir	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
-	.ioctl		= autofs4_root_ioctl,
+	.unlocked_ioctl	= autofs4_root_ioctl,
 };
 
 const struct file_operations autofs4_dir_operations = {
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
  * ioctl()'s on the root directory is the chief method for the daemon to
  * generate kernel reactions
  */
-static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg)
+static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
+				       unsigned int cmd, unsigned long arg)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
 	void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
 		return -ENOSYS;
 	}
 }
+
+/* unlocked_ioctl entry: run the handler between lock_kernel()/unlock_kernel(). */
+static long autofs4_root_ioctl(struct file *filp,
+			       unsigned int cmd, unsigned long arg)
+{
+	struct inode *root = filp->f_dentry->d_inode;
+	long err;
+
+	lock_kernel();
+	err = autofs4_root_ioctl_unlocked(root, filp, cmd, arg);
+	unlock_kernel();
+	return err;
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538..2909a03e523 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -832,11 +832,14 @@ static const struct file_operations btrfs_ctl_fops = {
 };
 
 static struct miscdevice btrfs_misc = {
-	.minor		= MISC_DYNAMIC_MINOR,
+	.minor		= BTRFS_MINOR,
 	.name		= "btrfs-control",
 	.fops		= &btrfs_ctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
+
 static int btrfs_interface_init(void)
 {
 	return misc_register(&btrfs_misc);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed..d9c60b84949 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
 	int rc = 0;
 	struct page **pages;
-	struct pagevec pvec;
 	loff_t offset;
 	u64 len;
 
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 	if (rc < 0)
 		goto out;
 
-	/* set uptodate and add to lru in pagevec-sized chunks */
-	pagevec_init(&pvec, 0);
 	for (; !list_empty(page_list) && len > 0;
 	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
 		struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 			zero_user_segment(page, s, PAGE_CACHE_SIZE);
 		}
 
-		if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
 			page_cache_release(page);
 			dout("readpages %p add_to_page_cache failed %p\n",
 			     inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 		unlock_page(page);
-		if (pagevec_add(&pvec, page) == 0)
-			pagevec_lru_add_file(&pvec);   /* add to lru */
+		page_cache_release(page);
 	}
-	pagevec_lru_add_file(&pvec);
 	rc = 0;
 
 out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 	ceph_release_pages(req->r_pages, req->r_num_pages);
 	if (req->r_pages_from_pool)
 		mempool_free(req->r_pages,
-			     ceph_client(inode->i_sb)->wb_pagevec_pool);
+			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
 	else
 		kfree(req->r_pages);
 	ceph_osdc_put_request(req);
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c..9f46de2ba7a 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
 
 	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
 	if (ret < 0) {
-		pr_err("error %d building request\n", ret);
+		pr_err("error %d building auth method %s request\n", ret,
+		       ac->ops->name);
 		return ret;
 	}
 	dout(" built request %d bytes\n", ret);
@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 		if (ac->protocol != protocol) {
 			ret = ceph_auth_init_protocol(ac, protocol);
 			if (ret) {
-				pr_err("error %d on auth protocol %d init\n",
-				       ret, protocol);
+				pr_err("error %d on auth method %d init\n",
+				       ret, protocol);
 				goto out;
 			}
 		}
@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 	if (ret == -EAGAIN) {
 		return ceph_build_auth_request(ac, reply_buf, reply_len);
 	} else if (ret) {
-		pr_err("authentication error %d\n", ret);
+		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
 		return ret;
 	}
 	return 0;
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb26..4429a707c02 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
 struct ceph_authorizer;
 
 struct ceph_auth_client_ops {
+	const char *name;
+
 	/*
 	 * true if we are authenticated and can connect to
 	 * services.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f..24407c11929 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
 }
 
 static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+	.name = "none",
 	.reset = reset,
 	.destroy = destroy,
 	.is_authenticated = is_authenticated,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da88..7b206231566 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 	int ret;
 	char *dbuf;
 	char *ticket_buf;
-	u8 struct_v;
+	u8 reply_struct_v;
 
 	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!dbuf)
@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		goto out_dbuf;
 
 	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	struct_v = ceph_decode_8(&p);
-	if (struct_v != 1)
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
 		goto bad;
 	num = ceph_decode_32(&p);
 	dout("%d tickets\n", num);
 	while (num--) {
 		int type;
-		u8 struct_v;
+		u8 tkt_struct_v, blob_struct_v;
 		struct ceph_x_ticket_handler *th;
 		void *dp, *dend;
 		int dlen;
@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		type = ceph_decode_32(&p);
 		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
 
-		struct_v = ceph_decode_8(&p);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
 			goto bad;
 
 		th = get_ticket_handler(ac, type);
@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		dend = dbuf + dlen;
 		dp = dbuf;
 
-		struct_v = ceph_decode_8(&dp);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
 			goto bad;
 
 		memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		tpend = tp + dlen;
 		dout(" ticket blob is %d bytes\n", dlen);
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-		struct_v = ceph_decode_8(&tp);
+		blob_struct_v = ceph_decode_8(&tp);
 		new_secret_id = ceph_decode_64(&tp);
 		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)
@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
 
 
 static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
 	.is_authenticated = ceph_x_is_authenticated,
 	.build_request = ceph_x_build_request,
 	.handle_reply = ceph_x_handle_reply,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b27..0dd0b81e64f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 {
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	     seq, issue_seq, mseq, follows, size, max_size,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+	if (!msg)
+		return -ENOMEM;
 
 	msg->hdr.tid = cpu_to_le64(flush_tid);
 
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  */
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				 struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
 
@@ -1663,7 +1665,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
 static int caps_are_flushed(struct inode *inode, unsigned tid)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int dirty, i, ret = 1;
+	int i, ret = 1;
 
 	spin_lock(&inode->i_lock);
-	dirty = __ceph_caps_dirty(ci);
 	for (i = 0; i < CEPH_CAP_BITS; i++)
 		if ((ci->i_flushing_caps & (1 << i)) &&
 		    ci->i_cap_flush_tid[i] <= tid) {
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 			err = wait_event_interruptible(ci->i_cap_wq,
 				       caps_are_flushed(inode, flush_tid));
 	} else {
-		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef365..3b9eeed097b 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_MINOR 20
 #define CEPH_VERSION_PATCH 0
 
 #define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
  * client-facing protocol.
  */
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
@@ -53,8 +53,18 @@
 /*
  * feature bits
  */
-#define CEPH_FEATURE_SUPPORTED  0
-#define CEPH_FEATURE_REQUIRED   0
+#define CEPH_FEATURE_UID        1
+#define CEPH_FEATURE_NOSRCADDR  2
+#define CEPH_FEATURE_FLOCK      4
+
+#define CEPH_FEATURE_SUPPORTED_MON  (CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR)
+#define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS  (CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK)
+#define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD  (CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR)
+#define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
 
 
 /*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_AUTH_NONE	 	0x1
 #define CEPH_AUTH_CEPHX	 	0x2
 
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
 
 /*********************************************
  * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_CLIENT_SNAP            0x312
 #define CEPH_MSG_CLIENT_CAPRELEASE      0x313
 
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
 /* osd */
 #define CEPH_MSG_OSD_MAP          41
 #define CEPH_MSG_OSD_OP           42
 #define CEPH_MSG_OSD_OPREPLY      43
 
+/* pool operation codes; ceph_pool_op_name() turns them into strings */
+enum {
+	POOL_OP_CREATE			= 0x01,
+	POOL_OP_DELETE			= 0x02,
+	POOL_OP_AUID_CHANGE		= 0x03,
+	POOL_OP_CREATE_SNAP		= 0x11,
+	POOL_OP_DELETE_SNAP		= 0x12,
+	POOL_OP_CREATE_UNMANAGED_SNAP	= 0x21,
+	POOL_OP_DELETE_UNMANAGED_SNAP	= 0x22,
+};
+
 struct ceph_mon_request_header {
 	__le64 have_version;
 	__le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
 	struct ceph_statfs st;
 } __attribute__ ((packed));
 
+const char *ceph_pool_op_name(int op);	/* printable name for a POOL_OP_* code */
+
+struct ceph_mon_poolop {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 pool;
+	__le32 op;		/* presumably a POOL_OP_* code -- TODO confirm */
+	__le64 auid;
+	__le64 snapid;
+	__le32 name_len;	/* NOTE(review): presumably a name follows -- confirm */
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 reply_code;
+	__le32 epoch;
+	char has_data;		/* NOTE(review): presumably set when data[] is present -- confirm */
+	char data[0];		/* zero-length trailing array */
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+	__le64 snapid;
+} __attribute__ ((packed));
+
 struct ceph_osd_getmap {
 	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
@@ -308,6 +361,7 @@ union ceph_mds_request_args {
 	struct {
 		__le32 frag;                 /* which dir fragment */
 		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
 	} __attribute__ ((packed)) readdir;
 	struct {
 		__le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c6..7503aee828c 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
 	case CEPH_ENTITY_TYPE_OSD: return "osd";
 	case CEPH_ENTITY_TYPE_MON: return "mon";
 	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
 	case CEPH_ENTITY_TYPE_AUTH: return "auth";
 	default: return "unknown";
 	}
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
 	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
 	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
 	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
 
 	case CEPH_OSD_OP_PULL: return "pull";
 	case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,29 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
+
+/*
+ * Map a POOL_OP_* code to a short printable name.  Codes without a
+ * case below come back as "???".
+ */
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE:
+		return "create";
+	case POOL_OP_DELETE:
+		return "delete";
+	case POOL_OP_AUID_CHANGE:
+		return "auid change";
+	case POOL_OP_CREATE_SNAP:
+		return "create snap";
+	case POOL_OP_DELETE_SNAP:
+		return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP:
+		return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP:
+		return "delete unmanaged snap";
+	default:
+		return "???";
+	}
+}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92ac..3be33fb066c 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 static int monc_show(struct seq_file *s, void *p)
 {
 	struct ceph_client *client = s->private;
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	struct ceph_mon_client *monc = &client->monc;
 	struct rb_node *rp;
 
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
 	if (monc->want_next_osdmap)
 		seq_printf(s, "want next osdmap\n");
 
-	for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
-		req = rb_entry(rp, struct ceph_mon_statfs_request, node);
-		seq_printf(s, "%lld statfs\n", req->tid);
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		op = le16_to_cpu(req->request->hdr.type);
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
 	}
 
 	mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed2..4fd30900eff 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
 		return -ENOMEM;          /* oh well */
 
 	spin_lock(&dentry->d_lock);
-	if (dentry->d_fsdata) /* lost a race */
+	if (dentry->d_fsdata) {
+		/* lost a race */
+		kmem_cache_free(ceph_dentry_cachep, di);
 		goto out_unlock;
+	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
 	dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
 	dentry = list_entry(p, struct dentry, d_u.d_child);
 	di = ceph_dentry(dentry);
 	while (1) {
-		dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+		     d_unhashed(dentry) ? "!hashed" : "hashed",
 		     parent->d_subdirs.prev, parent->d_subdirs.next);
 		if (p == &parent->d_subdirs) {
 			fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
 	const int max_entries = client->mount_args->max_readdir;
+	const int max_bytes = client->mount_args->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -312,6 +317,7 @@ more:
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
 		req->r_num_caps = max_entries + 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 		if (err < 0) {
@@ -335,7 +341,7 @@ more:
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
 			fi->last_name = NULL;
-			fi->next_offset = 0;
+			fi->next_offset = 2;
 		} else {
 			rinfo = &req->r_reply_info;
 			err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
 
 	/* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		    !is_root_ceph_dentry(dir, dentry) &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
-			di->offset = ci->i_max_offset++;
 			spin_unlock(&dir->i_lock);
 			dout(" dir %p complete, -ENOENT\n", dir);
 			d_add(dentry, NULL);
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 		/* ensure target dentry is invalidated, despite
 		   rehashing bug in vfs_rename_dir */
-		new_dentry->d_time = jiffies;
-		ceph_dentry(new_dentry)->lease_shared_gen = 0;
+		ceph_invalidate_dentry_lease(new_dentry);
 	}
 	ceph_mdsc_put_request(req);
 	return err;
 }
 
+/*
+ * Make sure @dentry's lease can no longer revalidate (done under d_lock).
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	ceph_dentry(dentry)->lease_shared_gen = 0;
+	dentry->d_time = jiffies;
+	spin_unlock(&dentry->d_lock);
+}
 
 /*
  * Check if dentry lease is valid.  If not, delete the lease.  Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 
-	dout("d_revalidate %p '%.*s' inode %p\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+	     ceph_dentry(dentry)->offset);
 
 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 
-	if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	struct ceph_dentry_info *di = ceph_dentry(dn);
 	struct ceph_mds_client *mdsc;
 
-	dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
-	     dn->d_name.len, dn->d_name.name);
+	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
+	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb32..17447644d67 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 
 	dentry = d_obtain_alias(inode);
-	if (!dentry) {
+	if (IS_ERR(dentry)) {
 		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
 		       fh->ino, inode);
 		iput(inode);
-		return ERR_PTR(-ENOMEM);
+		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
 
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
 				      struct ceph_nfs_confh *cfh)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
 	}
 
 	dentry = d_obtain_alias(inode);
-	if (!dentry) {
+	if (IS_ERR(dentry)) {
 		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
 		       cfh->ino, inode);
 		iput(inode);
-		return ERR_PTR(-ENOMEM);
+		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
 	if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 
 	dentry = d_obtain_alias(inode);
-	if (!dentry) {
+	if (IS_ERR(dentry)) {
 		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
 		       cfh->ino, inode);
 		iput(inode);
-		return ERR_PTR(-ENOMEM);
+		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
 	if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d634938edc..6512b6701b9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
 /*
  * allocate a vector new pages
  */
-static struct page **alloc_page_vector(int num_pages)
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
 {
 	struct page **pages;
 	int i;
 
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	pages = kmalloc(sizeof(*pages) * num_pages, flags);
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < num_pages; i++) {
-		pages[i] = alloc_page(GFP_NOFS);
+		pages[i] = __page_cache_alloc(flags);
 		if (pages[i] == NULL) {
 			ceph_release_page_vector(pages, i);
 			return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 		 * in sequence.
 		 */
 	} else {
-		pages = alloc_page_vector(num_pages);
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 	}
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
 				    &mtime, false, 2);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (!req)
+		return -ENOMEM;
 
 	num_pages = calc_pages_for(pos, len);
 
@@ -668,7 +668,7 @@ more:
 		truncate_inode_pages_range(inode->i_mapping, pos, 
 					   (pos+len) | (PAGE_CACHE_SIZE-1));
 	} else {
-		pages = alloc_page_vector(num_pages);
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
 	loff_t endoff = pos + iov->iov_len;
 	int got = 0;
 	int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeb..a81b8b662c7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			&ceph_client(ci->vfs_inode.i_sb)->mdsc;
+			&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
 
 		dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
 			memcpy(ci->i_xattrs.blob->vec.iov_base,
 			       iinfo->xattr_data, iinfo->xattr_len);
 		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+		xattr_blob = NULL;
 	}
 
 	inode->i_mapping->a_ops = &ceph_aops;
 	inode->i_mapping->backing_dev_info =
-		&ceph_client(inode->i_sb)->backing_dev_info;
+		&ceph_sb_to_client(inode->i_sb)->backing_dev_info;
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
 		/* set dir completion flag? */
 		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
 		    ceph_snap(inode) == CEPH_NOSNAP &&
-		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
 			ci->i_ceph_flags |= CEPH_I_COMPLETE;
 			ci->i_max_offset = 2;
 		}
 
 		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+		if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
 			inode->i_size = ci->i_rbytes;
 		break;
 	default:
@@ -802,6 +804,37 @@ out_unlock:
 }
 
 /*
+ * Set dentry's directory position based on the current dir's max, and
+ * order it in d_subdirs, so that dcache_readdir behaves.
+ */
+static void ceph_set_dentry_offset(struct dentry *dn)
+{
+	struct dentry *parent = dn->d_parent;
+	struct inode *dir_inode = parent->d_inode;
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_inode_info *ci;
+
+	BUG_ON(!dir_inode);
+	ci = ceph_inode(dir_inode);
+	/* an offset is only assigned while the dir is flagged CEPH_I_COMPLETE */
+	spin_lock(&dir_inode->i_lock);
+	if (!(ci->i_ceph_flags & CEPH_I_COMPLETE)) {
+		spin_unlock(&dir_inode->i_lock);
+		return;
+	}
+	di->offset = ci->i_max_offset++;
+	spin_unlock(&dir_inode->i_lock);
+
+	spin_lock(&dcache_lock);
+	spin_lock(&dn->d_lock);
+	list_move_tail(&parent->d_subdirs, &dn->d_u.d_child);
+	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
+	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
+	spin_unlock(&dn->d_lock);
+	spin_unlock(&dcache_lock);
+}
+
+/*
  * splice a dentry to an inode.
  * caller must hold directory i_mutex for this to be safe.
  *
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 {
 	struct dentry *realdn;
 
+	BUG_ON(dn->d_inode);
+
 	/* dn must be unhashed */
 	if (!d_unhashed(dn))
 		d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 		dn = realdn;
 	} else {
 		BUG_ON(!ceph_dentry(dn));
-
 		dout("dn %p attached to %p ino %llx.%llx\n",
 		     dn, dn->d_inode, ceph_vinop(dn->d_inode));
 	}
 	if ((!prehash || *prehash) && d_unhashed(dn))
 		d_rehash(dn);
+	ceph_set_dentry_offset(dn);
 out:
 	return dn;
 }
 
 /*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
-	struct dentry *dir = dn->d_parent;
-	struct inode *inode = dn->d_parent->d_inode;
-	struct ceph_dentry_info *di;
-
-	BUG_ON(!inode);
-
-	di = ceph_dentry(dn);
-
-	spin_lock(&inode->i_lock);
-	di->offset = ceph_inode(inode)->i_max_offset++;
-	spin_unlock(&inode->i_lock);
-
-	spin_lock(&dcache_lock);
-	spin_lock(&dn->d_lock);
-	list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
-	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
-	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
-	spin_unlock(&dn->d_lock);
-	spin_unlock(&dcache_lock);
-}
-
-/*
  * Incorporate results into the local cache.  This is either just
  * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
  * after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 
 	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
 		dout("fill_trace reply is empty!\n");
-		if (rinfo->head->result == 0 && req->r_locked_dir) {
-			struct ceph_inode_info *ci =
-				ceph_inode(req->r_locked_dir);
-			dout(" clearing %p complete (empty trace)\n",
-			     req->r_locked_dir);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-		}
+		if (rinfo->head->result == 0 && req->r_locked_dir)
+			ceph_invalidate_dir_request(req);
 		return 0;
 	}
 
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			     req->r_old_dentry->d_name.len,
 			     req->r_old_dentry->d_name.name,
 			     dn, dn->d_name.len, dn->d_name.name);
+
 			/* ensure target dentry is invalidated, despite
 			   rehashing bug in vfs_rename_dir */
-			dn->d_time = jiffies;
-			ceph_dentry(dn)->lease_shared_gen = 0;
+			ceph_invalidate_dentry_lease(dn);
+
 			/* take overwritten dentry's readdir offset */
+			dout("dn %p gets %p offset %lld (old offset %lld)\n",
+			     req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+			     ceph_dentry(req->r_old_dentry)->offset);
 			ceph_dentry(req->r_old_dentry)->offset =
 				ceph_dentry(dn)->offset;
+
 			dn = req->r_old_dentry;  /* use old_dentry */
 			in = dn->d_inode;
 		}
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 				goto done;
 			}
 			req->r_dentry = dn;  /* may have spliced */
-			ceph_set_dentry_offset(dn);
 			igrab(in);
 		} else if (ceph_ino(in) == vino.ino &&
 			   ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			err = PTR_ERR(dn);
 			goto done;
 		}
-		ceph_set_dentry_offset(dn);
 		req->r_dentry = dn;  /* may have spliced */
 		igrab(in);
 		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
-	if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+	if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
 		       &ci->i_vmtruncate_work)) {
 		dout("ceph_queue_vmtruncate %p\n", inode);
 		igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae6284..d085f07756b 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	struct ceph_ioctl_dataloc dl;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e0..885aa5710cf 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
 static void __wake_requests(struct ceph_mds_client *mdsc,
 			    struct list_head *head);
 
-const static struct ceph_connection_operations mds_con_ops;
+static const struct ceph_connection_operations mds_con_ops;
 
 
 /*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 	struct ceph_msg *msg;
 	struct ceph_mds_session_head *h;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
-	if (IS_ERR(msg)) {
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+	if (!msg) {
 		pr_err("create_session_msg ENOMEM creating msg\n");
-		return ERR_PTR(PTR_ERR(msg));
+		return NULL;
 	}
 	h = msg->front.iov_base;
 	h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
 	struct ceph_msg *msg;
 	int mstate;
 	int mds = session->s_mds;
-	int err = 0;
 
 	/* wait for mds to go active? */
 	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
 
 	/* send connect message */
 	msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
-	if (IS_ERR(msg)) {
-		err = PTR_ERR(msg);
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
-
-out:
 	return 0;
 }
 
@@ -804,12 +799,49 @@ out:
 }
 
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
-				   void *arg)
+				  void *arg)	/* arg is not used */
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = 0;	/* number of iput() calls owed once the locks are released */
+
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
-	ceph_remove_cap(cap);
+	spin_lock(&inode->i_lock);
+	__ceph_remove_cap(cap);
+	if (!__ceph_is_any_real_caps(ci)) {
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;
+
+		spin_lock(&mdsc->cap_dirty_lock);	/* nested inside i_lock */
+		if (!list_empty(&ci->i_dirty_item)) {
+			pr_info(" dropping dirty %s state for %p %lld\n",
+				ceph_cap_string(ci->i_dirty_caps),
+				inode, ceph_ino(inode));
+			ci->i_dirty_caps = 0;
+			list_del_init(&ci->i_dirty_item);
+			drop = 1;
+		}
+		if (!list_empty(&ci->i_flushing_item)) {
+			pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+				ceph_cap_string(ci->i_flushing_caps),
+				inode, ceph_ino(inode));
+			ci->i_flushing_caps = 0;
+			list_del_init(&ci->i_flushing_item);
+			mdsc->num_cap_flushing--;
+			drop = 1;	/* still one iput even if dirty state was dropped too */
+		}
+		if (drop && ci->i_wrbuffer_ref) {
+			pr_info(" dropping dirty data for %p %lld\n",
+				inode, ceph_ino(inode));
+			ci->i_wrbuffer_ref = 0;
+			ci->i_wrbuffer_ref_head = 0;
+			drop++;		/* one extra iput for the discarded buffer refs */
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&inode->i_lock);
+	while (drop--)		/* iput only after both spinlocks are released */
+		iput(inode);
 	return 0;
 }
 
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
 	dout("remove_session_caps on %p\n", session);
 	iterate_session_caps(session, remove_session_caps_cb, NULL);
 	BUG_ON(session->s_nr_caps > 0);
+	BUG_ON(!list_empty(&session->s_cap_flushing));
 	cleanup_cap_releases(session);
 }
 
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 		ceph_mds_state_name(state));
 	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
 				 ++session->s_renew_seq);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
+	if (!msg)
+		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
 	return 0;
 }
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
 				 struct ceph_mds_session *session)
 {
 	struct ceph_msg *msg;
-	int err = 0;
 
 	dout("request_close_session mds%d state %s seq %lld\n",
 	     session->s_mds, session_state_name(session->s_state),
 	     session->s_seq);
 	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
-	if (IS_ERR(msg))
-		err = PTR_ERR(msg);
-	else
-		ceph_con_send(&session->s_con, msg);
-	return err;
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
 }
 
 /*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
 	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
 		spin_unlock(&session->s_cap_lock);
 		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-				   0, 0, NULL);
+				   GFP_NOFS);
 		if (!msg)
 			goto out_unlocked;
 		dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
 	struct ceph_msg *msg;
 
 	dout("send_cap_releases mds%d\n", session->s_mds);
-	while (1) {
-		spin_lock(&session->s_cap_lock);
-		if (list_empty(&session->s_cap_releases_done))
-			break;
+	spin_lock(&session->s_cap_lock);
+	while (!list_empty(&session->s_cap_releases_done)) {
 		msg = list_first_entry(&session->s_cap_releases_done,
 				 struct ceph_msg, list_head);
 		list_del_init(&msg->list_head);
@@ -1162,10 +1191,66 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
 		ceph_con_send(&session->s_con, msg);
+		spin_lock(&session->s_cap_lock);
 	}
 	spin_unlock(&session->s_cap_lock);
 }
 
+/*
+ * Discard every cap release queued on @session.  Each message has its
+ * release count zeroed and its front trimmed back to the bare header,
+ * the discarded count is added back to s_num_cap_releases, and messages
+ * on s_cap_releases_done are moved back onto s_cap_releases.
+ *
+ * @mdsc is not used by the body.
+ */
+static void discard_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	unsigned num;
+
+	dout("discard_cap_releases mds%d\n", session->s_mds);
+	spin_lock(&session->s_cap_lock);
+
+	/*
+	 * zero out the in-progress message, if any: list_first_entry()
+	 * on an empty list does not return NULL, it returns a pointer
+	 * computed from the list head itself, which must not be written.
+	 */
+	if (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg, list_head);
+		head = msg->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout("discard_cap_releases mds%d %p %u\n",
+		     session->s_mds, msg, num);
+		head->num = cpu_to_le32(0);
+		/* trim the front too, as for the completed messages below */
+		msg->front.iov_len = sizeof(*head);
+		session->s_num_cap_releases += num;
+	}
+
+	/* requeue completed messages */
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				 struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+
+		head = msg->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
+		     num);
+		session->s_num_cap_releases += num;
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		list_add(&msg->list_head, &session->s_cap_releases);
+	}
+
+	spin_unlock(&session->s_cap_lock);
+}
+
 /*
  * requests
  */
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	if (!req)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&req->r_fill_mutex);
 	req->r_started = jiffies;
 	req->r_resend_mds = -1;
 	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
 			len += 1 + temp->d_name.len;
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+			pr_err("build_path corrupt dentry %p\n", dentry);
 			return ERR_PTR(-EINVAL);
 		}
 	}
@@ -1267,7 +1336,7 @@ retry:
 		struct inode *inode = temp->d_inode;
 
 		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
-			dout("build_path_dentry path+%d: %p SNAPDIR\n",
+			dout("build_path path+%d: %p SNAPDIR\n",
 			     pos, temp);
 		} else if (stop_on_nosnap && inode &&
 			   ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
 				break;
 			strncpy(path + pos, temp->d_name.name,
 				temp->d_name.len);
-			dout("build_path_dentry path+%d: %p '%.*s'\n",
-			     pos, temp, temp->d_name.len, path + pos);
 		}
 		if (pos)
 			path[--pos] = '/';
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			pr_err("build_path_dentry corrupt dentry\n");
+			pr_err("build_path corrupt dentry\n");
 			kfree(path);
 			return ERR_PTR(-EINVAL);
 		}
 	}
 	if (pos != 0) {
-		pr_err("build_path_dentry did not end path lookup where "
+		pr_err("build_path did not end path lookup where "
 		       "expected, namelen is %d, pos is %d\n", len, pos);
 		/* presumably this is only possible if racing with a
 		   rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
 
 	*base = ceph_ino(temp->d_inode);
 	*plen = len;
-	dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+	dout("build_path on %p %d built %llx '%.*s'\n",
 	     dentry, atomic_read(&dentry->d_count), *base, len, path);
 	return path;
 }
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	if (req->r_old_dentry_drop)
 		len += req->r_old_dentry->d_name.len;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
-	if (IS_ERR(msg))
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+	if (!msg) {
+		msg = ERR_PTR(-ENOMEM);
 		goto out_free2;
+	}
 
 	msg->hdr.tid = cpu_to_le64(req->r_tid);
 
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	}
 	msg = create_request_message(mdsc, req, mds);
 	if (IS_ERR(msg)) {
-		req->r_reply = ERR_PTR(PTR_ERR(msg));
+		req->r_err = PTR_ERR(msg);
 		complete_request(mdsc, req);
-		return -PTR_ERR(msg);
+		return PTR_ERR(msg);
 	}
 	req->r_request = msg;
 
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
 	int mds = -1;
 	int err = -EAGAIN;
 
-	if (req->r_reply)
+	if (req->r_err || req->r_got_result)
 		goto out;
 
 	if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
 	return err;
 
 finish:
-	req->r_reply = ERR_PTR(err);
+	req->r_err = err;
 	complete_request(mdsc, req);
 	goto out;
 }
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 
 /*
  * Wake up threads with requests pending for @mds, so that they can
- * resubmit their requests to a possibly different mds.  If @all is set,
- * wake up if their requests has been forwarded to @mds, too.
+ * resubmit their requests to a possibly different mds.
  */
-static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 {
 	struct ceph_mds_request *req;
 	struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	__register_request(mdsc, req, dir);
 	__do_request(mdsc, req);
 
-	/* wait */
-	if (!req->r_reply) {
-		mutex_unlock(&mdsc->mutex);
-		if (req->r_timeout) {
-			err = (long)wait_for_completion_interruptible_timeout(
-				&req->r_completion, req->r_timeout);
-			if (err == 0)
-				req->r_reply = ERR_PTR(-EIO);
-			else if (err < 0)
-				req->r_reply = ERR_PTR(err);
-		} else {
-                        err = wait_for_completion_interruptible(
-                                &req->r_completion);
-                        if (err)
-                                req->r_reply = ERR_PTR(err);
-		}
-		mutex_lock(&mdsc->mutex);
+	if (req->r_err) {
+		err = req->r_err;
+		__unregister_request(mdsc, req);
+		dout("do_request early error %d\n", err);
+		goto out;
 	}
 
-	if (IS_ERR(req->r_reply)) {
-		err = PTR_ERR(req->r_reply);
-		req->r_reply = NULL;
+	/* wait */
+	mutex_unlock(&mdsc->mutex);
+	dout("do_request waiting\n");
+	if (req->r_timeout) {
+		err = (long)wait_for_completion_interruptible_timeout(
+			&req->r_completion, req->r_timeout);
+		if (err == 0)
+			err = -EIO;
+	} else {
+		err = wait_for_completion_interruptible(&req->r_completion);
+	}
+	dout("do_request waited, got %d\n", err);
+	mutex_lock(&mdsc->mutex);
 
-		if (err == -ERESTARTSYS) {
-			/* aborted */
-			req->r_aborted = true;
+	/* only abort if we didn't race with a real reply */
+	if (req->r_got_result) {
+		err = le32_to_cpu(req->r_reply_info.head->result);
+	} else if (err < 0) {
+		dout("aborted request %lld with %d\n", req->r_tid, err);
 
-			if (req->r_locked_dir &&
-			    (req->r_op & CEPH_MDS_OP_WRITE)) {
-				struct ceph_inode_info *ci =
-					ceph_inode(req->r_locked_dir);
+		/*
+		 * ensure we aren't running concurrently with
+		 * ceph_fill_trace or ceph_readdir_prepopulate, which
+		 * rely on locks (dir mutex) held by our caller.
+		 */
+		mutex_lock(&req->r_fill_mutex);
+		req->r_err = err;
+		req->r_aborted = true;
+		mutex_unlock(&req->r_fill_mutex);
 
-				dout("aborted, clearing I_COMPLETE on %p\n", 
-				     req->r_locked_dir);
-				spin_lock(&req->r_locked_dir->i_lock);
-				ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-				ci->i_release_count++;
-				spin_unlock(&req->r_locked_dir->i_lock);
-			}
-		} else {
-			/* clean up this request */
-			__unregister_request(mdsc, req);
-			if (!list_empty(&req->r_unsafe_item))
-				list_del_init(&req->r_unsafe_item);
-			complete(&req->r_safe_completion);
-		}
-	} else if (req->r_err) {
-		err = req->r_err;
+		if (req->r_locked_dir &&
+		    (req->r_op & CEPH_MDS_OP_WRITE))
+			ceph_invalidate_dir_request(req);
 	} else {
-		err = le32_to_cpu(req->r_reply_info.head->result);
+		err = req->r_err;
 	}
-	mutex_unlock(&mdsc->mutex);
 
+out:
+	mutex_unlock(&mdsc->mutex);
 	dout("do_request %p done, result %d\n", req, err);
 	return err;
 }
 
 /*
+ * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+	struct inode *inode = req->r_locked_dir;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
+	spin_lock(&inode->i_lock);
+	ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+	ci->i_release_count++;
+	spin_unlock(&inode->i_lock);
+
+	if (req->r_dentry)
+		ceph_invalidate_dentry_lease(req->r_dentry);
+	if (req->r_old_dentry)
+		ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+
+/*
  * Handle mds reply.
  *
  * We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		mutex_unlock(&mdsc->mutex);
 		goto out;
 	}
+	if (req->r_got_safe && !head->safe) {
+		pr_warning("got unsafe after safe on %llu from mds%d\n",
+			   tid, mds);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
 
 	result = le32_to_cpu(head->result);
 
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 			mutex_unlock(&mdsc->mutex);
 			goto out;
 		}
-	}
-
-	BUG_ON(req->r_reply);
-
-	if (!head->safe) {
+	} else {
 		req->r_got_unsafe = true;
 		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
 	}
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	}
 
 	/* insert trace into our cache */
+	mutex_lock(&req->r_fill_mutex);
 	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
 	if (err == 0) {
 		if (result == 0 && rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(&req->r_caps_reservation);
 	}
+	mutex_unlock(&req->r_fill_mutex);
 
 	up_read(&mdsc->snap_rwsem);
 out_err:
-	if (err) {
-		req->r_err = err;
+	mutex_lock(&mdsc->mutex);
+	if (!req->r_aborted) {
+		if (err) {
+			req->r_err = err;
+		} else {
+			req->r_reply = msg;
+			ceph_msg_get(msg);
+			req->r_got_result = true;
+		}
 	} else {
-		req->r_reply = msg;
-		ceph_msg_get(msg);
+		dout("reply arrived after request %lld was aborted\n", tid);
 	}
+	mutex_unlock(&mdsc->mutex);
 
 	add_cap_releases(mdsc, req->r_session, -1);
 	mutex_unlock(&session->s_mutex);
@@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session,
 
 	switch (op) {
 	case CEPH_SESSION_OPEN:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect success\n", session->s_mds);
 		session->s_state = CEPH_MDS_SESSION_OPEN;
 		renewed_caps(mdsc, session, 0);
 		wake = 1;
@@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session,
 		break;
 
 	case CEPH_SESSION_CLOSE:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect denied\n", session->s_mds);
 		remove_session_caps(session);
 		wake = 1; /* for good measure */
 		complete(&mdsc->session_close_waiters);
-		kick_requests(mdsc, mds, 0);      /* cur only */
+		kick_requests(mdsc, mds);
 		break;
 
 	case CEPH_SESSION_STALE:
@@ -2132,54 +2229,44 @@ out:
  *
  * called with mdsc->mutex held.
  */
-static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_session *session)
 {
-	struct ceph_mds_session *session = NULL;
 	struct ceph_msg *reply;
 	struct rb_node *p;
+	int mds = session->s_mds;
 	int err = -ENOMEM;
 	struct ceph_pagelist *pagelist;
 
-	pr_info("reconnect to recovering mds%d\n", mds);
+	pr_info("mds%d reconnect start\n", mds);
 
 	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
 	if (!pagelist)
 		goto fail_nopagelist;
 	ceph_pagelist_init(pagelist);
 
-	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
-	if (IS_ERR(reply)) {
-		err = PTR_ERR(reply);
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+	if (!reply)
 		goto fail_nomsg;
-	}
-
-	/* find session */
-	session = __ceph_lookup_mds_session(mdsc, mds);
-	mutex_unlock(&mdsc->mutex);    /* drop lock for duration */
 
-	if (session) {
-		mutex_lock(&session->s_mutex);
+	mutex_lock(&session->s_mutex);
+	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+	session->s_seq = 0;
 
-		session->s_state = CEPH_MDS_SESSION_RECONNECTING;
-		session->s_seq = 0;
+	ceph_con_open(&session->s_con,
+		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
-		ceph_con_open(&session->s_con,
-			      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
-		/* replay unsafe requests */
-		replay_unsafe_requests(mdsc, session);
-	} else {
-		dout("no session for mds%d, will send short reconnect\n",
-		     mds);
-	}
+	/* replay unsafe requests */
+	replay_unsafe_requests(mdsc, session);
 
 	down_read(&mdsc->snap_rwsem);
 
-	if (!session)
-		goto send;
 	dout("session %p state %s\n", session,
 	     session_state_name(session->s_state));
 
+	/* drop old cap expires; we're about to reestablish that state */
+	discard_cap_releases(mdsc, session);
+
 	/* traverse this session's caps */
 	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
 	if (err)
@@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 			goto fail;
 	}
 
-send:
 	reply->pagelist = pagelist;
 	reply->hdr.data_len = cpu_to_le32(pagelist->length);
 	reply->nr_pages = calc_pages_for(0, pagelist->length);
 	ceph_con_send(&session->s_con, reply);
 
-	session->s_state = CEPH_MDS_SESSION_OPEN;
 	mutex_unlock(&session->s_mutex);
 
 	mutex_lock(&mdsc->mutex);
 	__wake_requests(mdsc, &session->s_waiting);
 	mutex_unlock(&mdsc->mutex);
 
-	ceph_put_mds_session(session);
-
 	up_read(&mdsc->snap_rwsem);
-	mutex_lock(&mdsc->mutex);
 	return;
 
 fail:
 	ceph_msg_put(reply);
 	up_read(&mdsc->snap_rwsem);
 	mutex_unlock(&session->s_mutex);
-	ceph_put_mds_session(session);
 fail_nomsg:
 	ceph_pagelist_release(pagelist);
 	kfree(pagelist);
 fail_nopagelist:
 	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
-	mutex_lock(&mdsc->mutex);
 	return;
 }
 
@@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 			}
 
 			/* kick any requests waiting on the recovering mds */
-			kick_requests(mdsc, i, 1);
+			kick_requests(mdsc, i);
 		} else if (oldstate == newstate) {
 			continue;  /* nothing new with this mds */
 		}
@@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 		 * send reconnect?
 		 */
 		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
-		    newstate >= CEPH_MDS_STATE_RECONNECT)
-			send_mds_reconnect(mdsc, i);
+		    newstate >= CEPH_MDS_STATE_RECONNECT) {
+			mutex_unlock(&mdsc->mutex);
+			send_mds_reconnect(mdsc, s);
+			mutex_lock(&mdsc->mutex);
+		}
 
 		/*
-		 * kick requests on any mds that has gone active.
-		 *
-		 * kick requests on cur or forwarder: we may have sent
-		 * the request to mds1, mds1 told us it forwarded it
-		 * to mds2, but then we learn mds1 failed and can't be
-		 * sure it successfully forwarded our request before
-		 * it died.
+		 * kick requests on any mds that has gone active.
 		 */
 		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
 		    newstate >= CEPH_MDS_STATE_ACTIVE) {
-			pr_info("mds%d reconnect completed\n", s->s_mds);
-			kick_requests(mdsc, i, 1);
+			if (oldstate != CEPH_MDS_STATE_CREATING &&
+			    oldstate != CEPH_MDS_STATE_STARTING)
+				pr_info("mds%d recovery completed\n", s->s_mds);
+			kick_requests(mdsc, i);
 			ceph_kick_flushing_caps(mdsc, s);
 			wake_up_session_caps(s, 1);
 		}
@@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 	dnamelen = dentry->d_name.len;
 	len += dnamelen;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
-	if (IS_ERR(msg))
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+	if (!msg)
 		return;
 	lease = msg->front.iov_base;
 	lease->action = action;
@@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work)
 		else
 			ceph_con_keepalive(&s->s_con);
 		add_cap_releases(mdsc, s, -1);
-		send_cap_releases(mdsc, s);
+		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+		    s->s_state == CEPH_MDS_SESSION_HUNG)
+			send_cap_releases(mdsc, s);
 		mutex_unlock(&s->s_mutex);
 		ceph_put_mds_session(s);
 
@@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	mdsc->client = client;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+	if (mdsc->mdsmap == NULL)
+		return -ENOMEM;
+
 	init_completion(&mdsc->safe_umount_waiters);
 	init_completion(&mdsc->session_close_waiters);
 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	spin_lock_init(&mdsc->dentry_lru_lock);
 	INIT_LIST_HEAD(&mdsc->dentry_lru);
+
 	return 0;
 }
 
@@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush;
 
+	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return;
+
 	dout("sync\n");
 	mutex_lock(&mdsc->mutex);
 	want_tid = mdsc->last_tid;
@@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con)
 static void peer_reset(struct ceph_connection *con)
 {
 	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
 
-	pr_err("mds%d gave us the boot.  IMPLEMENT RECONNECT.\n",
-	       s->s_mds);
+	pr_warning("mds%d closed our session\n", s->s_mds);
+	send_mds_reconnect(mdsc, s);
 }
 
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	return ceph_monc_validate_auth(&mdsc->client->monc);
 }
 
-const static struct ceph_connection_operations mds_con_ops = {
+static const struct ceph_connection_operations mds_con_ops = {
 	.get = con_get,
 	.put = con_put,
 	.dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f6587..d9936c4f121 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
 	struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
 	struct inode *r_target_inode;       /* resulting inode */
 
+	struct mutex r_fill_mutex;
+
 	union ceph_mds_request_args r_args;
 	int r_fmode;        /* file mode, if expecting cap */
 
@@ -213,7 +215,7 @@ struct ceph_mds_request {
 	struct completion r_safe_completion;
 	ceph_mds_request_callback_t r_callback;
 	struct list_head  r_unsafe_item;  /* per-session unsafe list item */
-	bool		  r_got_unsafe, r_got_safe;
+	bool		  r_got_unsafe, r_got_safe, r_got_result;
 
 	bool              r_did_prepopulate;
 	u32               r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
 				    struct inode *inode,
 				    struct dentry *dn, int mask);
 
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
+
 extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491..60b74839ebe 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
 static void ceph_fault(struct ceph_connection *con);
 
-const char *ceph_name_type_str(int t)
-{
-	switch (t) {
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
-	default: return "???";
-	}
-}
-
 /*
  * nicely render a sockaddr as a string.
  */
@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con)
 		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
 	}
+	con->out_keepalive_pending = false;
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
 }
@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con)
 	clear_bit(WRITE_PENDING, &con->state);
 	mutex_lock(&con->mutex);
 	reset_connection(con);
+	con->peer_global_seq = 0;
 	cancel_delayed_work(&con->work);
 	mutex_unlock(&con->mutex);
 	queue_con(con);
@@ -661,7 +651,8 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
 	     con->connect_seq, global_seq, proto);
 
-	con->out_connect.features = CEPH_FEATURE_SUPPORTED;
+	/* wire format is little-endian (cf. le64_to_cpu on the reply) */
+	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
 	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
 	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con)
 
 static int process_connect(struct ceph_connection *con)
 {
-	u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-	u64 req_feat = CEPH_FEATURE_REQUIRED;
+	u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
+	u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
 	u64 server_feat = le64_to_cpu(con->in_reply.features);
 
 	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
 		clear_bit(CONNECTING, &con->state);
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
+		con->peer_features = server_feat;
 		dout("process_connect got READY gseq %d cseq %d (%d)\n",
 		     con->peer_global_seq,
 		     le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con)
 		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
 		if (skip) {
 			/* skip this message */
-			dout("alloc_msg returned NULL, skipping message\n");
+			dout("alloc_msg said skip message\n");
 			con->in_base_pos = -front_len - middle_len - data_len -
 				sizeof(m->footer);
 			con->in_tag = CEPH_MSGR_TAG_READY;
 			con->in_seq++;
 			return 0;
 		}
-		if (IS_ERR(con->in_msg)) {
-			ret = PTR_ERR(con->in_msg);
-			con->in_msg = NULL;
+		if (!con->in_msg) {
 			con->error_msg =
 				"error allocating memory for incoming message";
-			return ret;
+			return -ENOMEM;
 		}
 		m = con->in_msg;
 		m->front.iov_len = 0;    /* haven't read it yet */
@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con)
 
 	/* if first message, set peer_name */
 	if (con->peer_name.type == 0)
-		con->peer_name = msg->hdr.src.name;
+		con->peer_name = msg->hdr.src;
 
 	con->in_seq++;
 	mutex_unlock(&con->mutex);
 
 	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
 	     msg, le64_to_cpu(msg->hdr.seq),
-	     ENTITY_NAME(msg->hdr.src.name),
+	     ENTITY_NAME(msg->hdr.src),
 	     le16_to_cpu(msg->hdr.type),
 	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
 	     le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con)
 	dout("try_write start %p state %lu nref %d\n", con, con->state,
 	     atomic_read(&con->nref));
 
-	mutex_lock(&con->mutex);
 more:
 	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
 
@@ -1639,7 +1627,6 @@ do_next:
 done:
 	ret = 0;
 out:
-	mutex_unlock(&con->mutex);
 	dout("try_write done on %p\n", con);
 	return ret;
 }
@@ -1651,7 +1638,6 @@ out:
  */
 static int try_read(struct ceph_connection *con)
 {
-	struct ceph_messenger *msgr;
 	int ret = -1;
 
 	if (!con->sock)
@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con)
 		return 0;
 
 	dout("try_read start on %p\n", con);
-	msgr = con->msgr;
-
-	mutex_lock(&con->mutex);
 
 more:
 	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1741,6 @@ more:
 done:
 	ret = 0;
 out:
-	mutex_unlock(&con->mutex);
 	dout("try_read done on %p\n", con);
 	return ret;
 
@@ -1830,6 +1812,8 @@ more:
 	dout("con_work %p start, clearing QUEUED\n", con);
 	clear_bit(QUEUED, &con->state);
 
+	mutex_lock(&con->mutex);
+
 	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
 		dout("con_work CLOSED\n");
 		con_close_socket(con);
@@ -1844,11 +1828,16 @@ more:
 	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
 	    try_read(con) < 0 ||
 	    try_write(con) < 0) {
+		mutex_unlock(&con->mutex);
 		backoff = 1;
 		ceph_fault(con);     /* error/fault path */
+		goto done_unlocked;
 	}
 
 done:
+	mutex_unlock(&con->mutex);
+
+done_unlocked:
 	clear_bit(BUSY, &con->state);
 	dout("con->state=%lu\n", con->state);
 	if (test_bit(QUEUED, &con->state)) {
@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 
 	/* the zero page is needed if a request is "canceled" while the message
 	 * is being written over the socket */
-	msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
 	if (!msgr->zero_page) {
 		kfree(msgr);
 		return ERR_PTR(-ENOMEM);
@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	}
 
 	/* set src+dst */
-	msg->hdr.src.name = con->msgr->inst.name;
-	msg->hdr.src.addr = con->msgr->my_enc_addr;
-	msg->hdr.orig_src = msg->hdr.src;
+	msg->hdr.src = con->msgr->inst.name;
 
 	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
 
@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
  */
-struct ceph_msg *ceph_msg_new(int type, int front_len,
-			      int page_len, int page_off, struct page **pages)
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
 {
 	struct ceph_msg *m;
 
-	m = kmalloc(sizeof(*m), GFP_NOFS);
+	m = kmalloc(sizeof(*m), flags);
 	if (m == NULL)
 		goto out;
 	kref_init(&m->kref);
@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	m->hdr.version = 0;
 	m->hdr.front_len = cpu_to_le32(front_len);
 	m->hdr.middle_len = 0;
-	m->hdr.data_len = cpu_to_le32(page_len);
-	m->hdr.data_off = cpu_to_le16(page_off);
+	m->hdr.data_len = 0;
+	m->hdr.data_off = 0;
 	m->hdr.reserved = 0;
 	m->footer.front_crc = 0;
 	m->footer.middle_crc = 0;
@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	/* front */
 	if (front_len) {
 		if (front_len > PAGE_CACHE_SIZE) {
-			m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+			m->front.iov_base = __vmalloc(front_len, flags,
 						      PAGE_KERNEL);
 			m->front_is_vmalloc = true;
 		} else {
-			m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+			m->front.iov_base = kmalloc(front_len, flags);
 		}
 		if (m->front.iov_base == NULL) {
 			pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	m->middle = NULL;
 
 	/* data */
-	m->nr_pages = calc_pages_for(page_off, page_len);
-	m->pages = pages;
+	m->nr_pages = 0;
+	m->pages = NULL;
 	m->pagelist = NULL;
 
-	dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
-	     m->nr_pages);
+	dout("ceph_msg_new %p front %d\n", m, front_len);
 	return m;
 
 out2:
 	ceph_msg_put(m);
 out:
-	pr_err("msg_new can't create type %d len %d\n", type, front_len);
-	return ERR_PTR(-ENOMEM);
+	pr_err("msg_new can't create type %d front %d\n", type, front_len);
+	return NULL;
 }
 
 /*
@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 		mutex_unlock(&con->mutex);
 		msg = con->ops->alloc_msg(con, hdr, skip);
 		mutex_lock(&con->mutex);
-		if (IS_ERR(msg))
-			return msg;
-
-		if (*skip)
+		if (!msg || *skip)
 			return NULL;
 	}
 	if (!msg) {
 		*skip = 0;
-		msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+		msg = ceph_msg_new(type, front_len, GFP_NOFS);
 		if (!msg) {
 			pr_err("unable to allocate msg type %d len %d\n",
 			       type, front_len);
-			return ERR_PTR(-ENOMEM);
+			return NULL;
 		}
 	}
 	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
 
-	if (middle_len) {
+	if (middle_len && !msg->middle) {
 		ret = ceph_alloc_middle(con, msg);
-
 		if (ret < 0) {
 			ceph_msg_put(msg);
-			return msg;
+			return NULL;
 		}
 	}
 
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc97..00a9430b1ff 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
 					int *skip);
 };
 
-extern const char *ceph_name_type_str(int t);
-
 /* use format string %s%d */
-#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
 
 struct ceph_messenger {
 	struct ceph_entity_inst inst;    /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
 	struct ceph_entity_addr peer_addr; /* peer address */
 	struct ceph_entity_name peer_name; /* peer name */
 	struct ceph_entity_addr peer_addr_for_me;
+	u64 peer_features;    /* feature bits from peer's connect reply */
 	u32 connect_seq;      /* identify the most recent connection
 				 attempt for this connection, client */
 	u32 peer_global_seq;  /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
 	struct list_head out_queue;
 	struct list_head out_sent;   /* sending or sent but unacked */
 	u64 out_seq;		     /* last message queued for send */
-	u64 out_seq_sent;            /* last message sent */
 	bool out_keepalive_pending;
 
 	u64 in_seq, in_seq_acked;  /* last message received, acked */
@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
 
-extern struct ceph_msg *ceph_msg_new(int type, int front_len,
-				     int page_len, int page_off,
-				     struct page **pages);
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
 extern void ceph_msg_kfree(struct ceph_msg *m);
 
 
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca95..f6510a476e7 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
  * resend any outstanding requests.
  */
 
-const static struct ceph_connection_operations mon_con_ops;
+static const struct ceph_connection_operations mon_con_ops;
 
 static int __validate_auth(struct ceph_mon_client *monc);
 
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 	monc->pending_auth = 1;
 	monc->m_auth->front.iov_len = len;
 	monc->m_auth->hdr.front_len = cpu_to_le32(len);
+	ceph_con_revoke(monc->con, monc->m_auth);
 	ceph_msg_get(monc->m_auth);  /* keep our ref */
 	ceph_con_send(monc->con, monc->m_auth);
 }
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 	     monc->want_next_osdmap);
 	if ((__sub_expired(monc) && !monc->sub_sent) ||
 	    monc->want_next_osdmap == 1) {
-		struct ceph_msg *msg;
+		struct ceph_msg *msg = monc->m_subscribe;
 		struct ceph_mon_subscribe_item *i;
 		void *p, *end;
 
-		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
-		if (!msg)
-			return;
-
 		p = msg->front.iov_base;
-		end = p + msg->front.iov_len;
+		end = p + msg->front_max;
 
 		dout("__send_subscribe to 'mdsmap' %u+\n",
 		     (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 
 		msg->front.iov_len = p - msg->front.iov_base;
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		ceph_con_send(monc->con, msg);
+		ceph_con_revoke(monc->con, msg);
+		ceph_con_send(monc->con, ceph_msg_get(msg));
 
 		monc->sub_sent = jiffies | 1;  /* never 0 */
 	}
@@ -353,14 +351,14 @@ out:
 /*
  * statfs
  */
-static struct ceph_mon_statfs_request *__lookup_statfs(
+static struct ceph_mon_generic_request *__lookup_generic_req(
 	struct ceph_mon_client *monc, u64 tid)
 {
-	struct ceph_mon_statfs_request *req;
-	struct rb_node *n = monc->statfs_request_tree.rb_node;
+	struct ceph_mon_generic_request *req;
+	struct rb_node *n = monc->generic_request_tree.rb_node;
 
 	while (n) {
-		req = rb_entry(n, struct ceph_mon_statfs_request, node);
+		req = rb_entry(n, struct ceph_mon_generic_request, node);
 		if (tid < req->tid)
 			n = n->rb_left;
 		else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
 	return NULL;
 }
 
-static void __insert_statfs(struct ceph_mon_client *monc,
-			    struct ceph_mon_statfs_request *new)
+static void __insert_generic_request(struct ceph_mon_client *monc,
+			    struct ceph_mon_generic_request *new)
 {
-	struct rb_node **p = &monc->statfs_request_tree.rb_node;
+	struct rb_node **p = &monc->generic_request_tree.rb_node;
 	struct rb_node *parent = NULL;
-	struct ceph_mon_statfs_request *req = NULL;
+	struct ceph_mon_generic_request *req = NULL;
 
 	while (*p) {
 		parent = *p;
-		req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+		req = rb_entry(parent, struct ceph_mon_generic_request, node);
 		if (new->tid < req->tid)
 			p = &(*p)->rb_left;
 		else if (new->tid > req->tid)
@@ -390,113 +388,164 @@ static void __insert_statfs(struct ceph_mon_client *monc,
 	}
 
 	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &monc->statfs_request_tree);
+	rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+/*
+ * Final kref release: drop the reply and request message references
+ * (either may still be NULL if setup failed) and free the request,
+ * which ceph_monc_do_statfs allocated with kzalloc.
+ */
+static void release_generic_request(struct kref *kref)
+{
+	struct ceph_mon_generic_request *req =
+		container_of(kref, struct ceph_mon_generic_request, kref);
+
+	if (req->reply)
+		ceph_msg_put(req->reply);
+	if (req->request)
+		ceph_msg_put(req->request);
+
+	kfree(req);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+					 struct ceph_msg_header *hdr,
+					 int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	struct ceph_mon_generic_request *req;
+	u64 tid = le64_to_cpu(hdr->tid);
+	struct ceph_msg *m;
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (!req) {
+		dout("get_generic_reply %lld dne\n", tid);
+		*skip = 1;
+		m = NULL;
+	} else {
+		dout("get_generic_reply %lld got %p\n", tid, req->reply);
+		m = ceph_msg_get(req->reply);
+		/*
+		 * we don't need to track the connection reading into
+		 * this reply because we only have one open connection
+		 * at a time, ever.
+		 */
+	}
+	mutex_unlock(&monc->mutex);
+	return m;
 }
 
 static void handle_statfs_reply(struct ceph_mon_client *monc,
 				struct ceph_msg *msg)
 {
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-	u64 tid;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
 
 	if (msg->front.iov_len != sizeof(*reply))
 		goto bad;
-	tid = le64_to_cpu(msg->hdr.tid);
 	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_statfs(monc, tid);
+	req = __lookup_generic_req(monc, tid);
 	if (req) {
-		*req->buf = reply->st;
+		*(struct ceph_statfs *)req->buf = reply->st;
 		req->result = 0;
+		get_generic_request(req);	/* hold req across complete() */
 	}
 	mutex_unlock(&monc->mutex);
-	if (req)
+	if (req) {
 		complete(&req->completion);
+		put_generic_request(req);
+	}
 	return;
 
 bad:
-	pr_err("corrupt statfs reply, no tid\n");
+	pr_err("corrupt generic reply, tid %llu\n", tid);
 	ceph_msg_dump(msg);
 }
 
 /*
- * (re)send a statfs request
+ * Do a synchronous statfs().
  */
-static int send_statfs(struct ceph_mon_client *monc,
-		       struct ceph_mon_statfs_request *req)
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
-	struct ceph_msg *msg;
+	struct ceph_mon_generic_request *req;
 	struct ceph_mon_statfs *h;
+	int err;
 
-	dout("send_statfs tid %llu\n", req->tid);
-	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-	req->request = msg;
-	msg->hdr.tid = cpu_to_le64(req->tid);
-	h = msg->front.iov_base;
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	h = req->request->front.iov_base;
 	h->monhdr.have_version = 0;
 	h->monhdr.session_mon = cpu_to_le16(-1);
 	h->monhdr.session_mon_tid = 0;
 	h->fsid = monc->monmap->fsid;
-	ceph_con_send(monc->con, msg);
-	return 0;
-}
-
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-	struct ceph_mon_statfs_request req;
-	int err;
-
-	req.buf = buf;
-	init_completion(&req.completion);
-
-	/* allocate memory for reply */
-	err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
-	if (err)
-		return err;
 
 	/* register request */
 	mutex_lock(&monc->mutex);
-	req.tid = ++monc->last_tid;
-	req.last_attempt = jiffies;
-	req.delay = BASE_DELAY_INTERVAL;
-	__insert_statfs(monc, &req);
-	monc->num_statfs_requests++;
+	req->tid = ++monc->last_tid;
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	__insert_generic_request(monc, req);
+	monc->num_generic_requests++;
 	mutex_unlock(&monc->mutex);
 
 	/* send request and wait */
-	err = send_statfs(monc, &req);
-	if (!err)
-		err = wait_for_completion_interruptible(&req.completion);
+	ceph_con_send(monc->con, ceph_msg_get(req->request));
+	err = wait_for_completion_interruptible(&req->completion);
 
 	mutex_lock(&monc->mutex);
-	rb_erase(&req.node, &monc->statfs_request_tree);
-	monc->num_statfs_requests--;
-	ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
+	rb_erase(&req->node, &monc->generic_request_tree);
+	monc->num_generic_requests--;
 	mutex_unlock(&monc->mutex);
 
 	if (!err)
-		err = req.result;
+		err = req->result;
+
+out:
+	kref_put(&req->kref, release_generic_request);
 	return err;
 }
 
 /*
  * Resend pending statfs requests.
  */
-static void __resend_statfs(struct ceph_mon_client *monc)
+static void __resend_generic_request(struct ceph_mon_client *monc)
 {
-	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_generic_request *req;
 	struct rb_node *p;
 
-	for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_mon_statfs_request, node);
-		send_statfs(monc, req);
+	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mon_generic_request, node);
+		ceph_con_revoke(monc->con, req->request);
+		ceph_con_send(monc->con, ceph_msg_get(req->request));
 	}
 }
 
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
 		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
 
-	/* msg pools */
-	err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
-			       sizeof(struct ceph_mon_subscribe_ack), 1, false);
-	if (err < 0)
+	/* msgs */
+	err = -ENOMEM;
+	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     GFP_NOFS);
+	if (!monc->m_subscribe_ack)
 		goto out_monmap;
-	err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
-				sizeof(struct ceph_mon_statfs_reply), 0, false);
-	if (err < 0)
-		goto out_pool1;
-	err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
-	if (err < 0)
-		goto out_pool2;
-
-	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+	if (!monc->m_subscribe)
+		goto out_subscribe_ack;
+
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+	if (!monc->m_auth_reply)
+		goto out_subscribe;
+
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
 	monc->pending_auth = 0;
-	if (IS_ERR(monc->m_auth)) {
-		err = PTR_ERR(monc->m_auth);
-		monc->m_auth = NULL;
-		goto out_pool3;
-	}
+	if (!monc->m_auth)
+		goto out_auth_reply;
 
 	monc->cur_mon = -1;
 	monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->sub_sent = 0;
 
 	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-	monc->statfs_request_tree = RB_ROOT;
-	monc->num_statfs_requests = 0;
+	monc->generic_request_tree = RB_ROOT;
+	monc->num_generic_requests = 0;
 	monc->last_tid = 0;
 
 	monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->want_next_osdmap = 1;
 	return 0;
 
-out_pool3:
-	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
-out_pool2:
-	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-out_pool1:
-	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+out_auth_reply:
+	ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+	ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+	ceph_msg_put(monc->m_subscribe_ack);
 out_monmap:
 	kfree(monc->monmap);
 out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 	ceph_auth_destroy(monc->auth);
 
 	ceph_msg_put(monc->m_auth);
-	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
-	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+	ceph_msg_put(monc->m_auth_reply);
+	ceph_msg_put(monc->m_subscribe);
+	ceph_msg_put(monc->m_subscribe_ack);
 
 	kfree(monc->monmap);
 }
@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 		monc->client->msgr->inst.name.num = monc->auth->global_id;
 
 		__send_subscribe(monc);
-		__resend_statfs(monc);
+		__resend_generic_request(monc);
 	}
 	mutex_unlock(&monc->mutex);
 }
@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 
 	switch (type) {
 	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+		m = ceph_msg_get(monc->m_subscribe_ack);
 		break;
 	case CEPH_MSG_STATFS_REPLY:
-		m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
-		break;
+		return get_generic_reply(con, hdr, skip);
 	case CEPH_MSG_AUTH_REPLY:
-		m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+		m = ceph_msg_get(monc->m_auth_reply);
 		break;
 	case CEPH_MSG_MON_MAP:
 	case CEPH_MSG_MDS_MAP:
 	case CEPH_MSG_OSD_MAP:
-		m = ceph_msg_new(type, front_len, 0, 0, NULL);
+		m = ceph_msg_new(type, front_len, GFP_NOFS);
 		break;
 	}
 
@@ -826,7 +867,7 @@ out:
 	mutex_unlock(&monc->mutex);
 }
 
-const static struct ceph_connection_operations mon_con_ops = {
+static const struct ceph_connection_operations mon_con_ops = {
 	.get = ceph_con_get,
 	.put = ceph_con_put,
 	.dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa0..174d794321d 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
 #define _FS_CEPH_MON_CLIENT_H
 
 #include <linux/completion.h>
+#include <linux/kref.h>
 #include <linux/rbtree.h>
 
 #include "messenger.h"
-#include "msgpool.h"
 
 struct ceph_client;
 struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
 };
 
 struct ceph_mon_client;
-struct ceph_mon_statfs_request;
+struct ceph_mon_generic_request;
 
 
 /*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
 };
 
 /*
- * statfs() is done a bit differently because we need to get data back
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are being done a bit differently because we need to get data back
  * to the caller
  */
-struct ceph_mon_statfs_request {
+struct ceph_mon_generic_request {
+	struct kref kref;
 	u64 tid;
 	struct rb_node node;
 	int result;
-	struct ceph_statfs *buf;
+	void *buf;
 	struct completion completion;
-	unsigned long last_attempt, delay; /* jiffies */
 	struct ceph_msg *request;  /* original request */
+	struct ceph_msg *reply;    /* and reply */
 };
 
 struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
 	struct delayed_work delayed_work;
 
 	struct ceph_auth_client *auth;
-	struct ceph_msg *m_auth;
+	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
 	int pending_auth;
 
 	bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
 	struct ceph_connection *con;
 	bool have_fsid;
 
-	/* msg pools */
-	struct ceph_msgpool msgpool_subscribe_ack;
-	struct ceph_msgpool msgpool_statfs_reply;
-	struct ceph_msgpool msgpool_auth_reply;
-
-	/* pending statfs requests */
-	struct rb_root statfs_request_tree;
-	int num_statfs_requests;
+	/* pending generic requests */
+	struct rb_root generic_request_tree;
+	int num_generic_requests;
 	u64 last_tid;
 
 	/* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2..dd65a643813 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
 
 #include "msgpool.h"
 
-/*
- * We use msg pools to preallocate memory for messages we expect to
- * receive over the wire, to avoid getting ourselves into OOM
- * conditions at unexpected times.  We take use a few different
- * strategies:
- *
- *  - for request/response type interactions, we preallocate the
- * memory needed for the response when we generate the request.
- *
- *  - for messages we can receive at any time from the MDS, we preallocate
- * a pool of messages we can re-use.
- *
- *  - for writeback, we preallocate some number of messages to use for
- * requests and their replies, so that we always make forward
- * progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector.  This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+	struct ceph_msgpool *pool = arg;
+	void *p;
 
+	p = ceph_msg_new(0, pool->front_len, gfp_mask);
+	if (!p)
+		pr_err("msgpool %s alloc failed\n", pool->name);
+	return p;
+}
 
-/*
- * Allocate or release as necessary to meet our target pool size.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
+static void free_fn(void *element, void *arg)
 {
-	struct ceph_msg *msg;
-
-	while (pool->num < pool->min) {
-		dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
-		     pool->min);
-		spin_unlock(&pool->lock);
-		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-		spin_lock(&pool->lock);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-		msg->pool = pool;
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-	}
-	while (pool->num > pool->min) {
-		msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
-		dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
-		     pool->min, msg);
-		list_del_init(&msg->list_head);
-		pool->num--;
-		ceph_msg_kfree(msg);
-	}
-	return 0;
+	ceph_msg_put(element);
 }
 
 int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int min, bool blocking)
+		      int front_len, int size, bool blocking, const char *name)
 {
-	int ret;
-
-	dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
-	spin_lock_init(&pool->lock);
 	pool->front_len = front_len;
-	INIT_LIST_HEAD(&pool->msgs);
-	pool->num = 0;
-	pool->min = min;
-	pool->blocking = blocking;
-	init_waitqueue_head(&pool->wait);
-
-	spin_lock(&pool->lock);
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
+	pool->name = name;	/* alloc_fn may print it from mempool_create */
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	return 0;
 }
 
 void ceph_msgpool_destroy(struct ceph_msgpool *pool)
 {
-	dout("msgpool_destroy %p\n", pool);
-	spin_lock(&pool->lock);
-	pool->min = 0;
-	__fill_msgpool(pool);
-	spin_unlock(&pool->lock);
+	mempool_destroy(pool->pool);
 }
 
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
 {
-	int ret;
-
-	spin_lock(&pool->lock);
-	dout("msgpool_resv %p delta %d\n", pool, delta);
-	pool->min += delta;
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
-{
-	wait_queue_t wait;
-	struct ceph_msg *msg;
-
-	if (front_len && front_len > pool->front_len) {
-		pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
-		       pool, front_len, pool->front_len);
+	if (front_len > pool->front_len) {
+		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+		       pool->name, front_len, pool->front_len);
 		WARN_ON(1);
 
 		/* try to alloc a fresh message */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-	}
-
-	if (!front_len)
-		front_len = pool->front_len;
-
-	if (pool->blocking) {
-		/* mempool_t behavior; first try to alloc */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
+		return ceph_msg_new(0, front_len, GFP_NOFS);
 	}
 
-	while (1) {
-		spin_lock(&pool->lock);
-		if (likely(pool->num)) {
-			msg = list_entry(pool->msgs.next, struct ceph_msg,
-					 list_head);
-			list_del_init(&msg->list_head);
-			pool->num--;
-			dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
-			     pool->num, pool->min);
-			spin_unlock(&pool->lock);
-			return msg;
-		}
-		pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-		       pool->min, pool->blocking ? "waiting" : "may fail");
-		spin_unlock(&pool->lock);
-
-		if (!pool->blocking) {
-			WARN_ON(1);
-
-			/* maybe we can allocate it now? */
-			msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-			if (!IS_ERR(msg))
-				return msg;
-
-			pr_err("msgpool_get %p empty + alloc failed\n", pool);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		init_wait(&wait);
-		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
-		schedule();
-		finish_wait(&pool->wait, &wait);
-	}
+	return mempool_alloc(pool->pool, GFP_NOFS);
 }
 
 void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
 {
-	spin_lock(&pool->lock);
-	if (pool->num < pool->min) {
-		/* reset msg front_len; user may have changed it */
-		msg->front.iov_len = pool->front_len;
-		msg->hdr.front_len = cpu_to_le32(pool->front_len);
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
 
-		kref_set(&msg->kref, 1);  /* retake a single ref */
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		wake_up(&pool->wait);
-	} else {
-		dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		ceph_msg_kfree(msg);
-	}
+	kref_init(&msg->kref);  /* retake single ref */
 }
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd72..a362605f936 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
 #ifndef _FS_CEPH_MSGPOOL
 #define _FS_CEPH_MSGPOOL
 
+#include <linux/mempool.h>
 #include "messenger.h"
 
 /*
@@ -8,18 +9,15 @@
  * avoid unexpected OOM conditions.
  */
 struct ceph_msgpool {
-	spinlock_t lock;
+	const char *name;
+	mempool_t *pool;
 	int front_len;          /* preallocated payload size */
-	struct list_head msgs;  /* msgs in the pool; each has 1 ref */
-	int num, min;           /* cur, min # msgs in the pool */
-	bool blocking;
-	wait_queue_head_t wait;
 };
 
 extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-			     int front_len, int size, bool blocking);
+			     int front_len, int size, bool blocking,
+			     const char *name);
 extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
 extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
 					 int front_len);
 extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f..892a0298dfd 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
 #define CEPH_ENTITY_TYPE_MDS    0x02
 #define CEPH_ENTITY_TYPE_OSD    0x04
 #define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_ADMIN  0x10
 #define CEPH_ENTITY_TYPE_AUTH   0x20
 
 #define CEPH_ENTITY_TYPE_ANY    0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
 /*
  * message header
  */
-struct ceph_msg_header {
+struct ceph_msg_header_old {
 	__le64 seq;       /* message seq# for this session */
 	__le64 tid;       /* transaction id */
 	__le16 type;      /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
 	__le32 crc;       /* header crc32c */
 } __attribute__ ((packed));
 
+struct ceph_msg_header {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_name src;
+	__le32 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
 #define CEPH_MSG_PRIO_LOW     64
 #define CEPH_MSG_PRIO_DEFAULT 127
 #define CEPH_MSG_PRIO_HIGH    196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85..afa7bb3895c 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
 #define OSD_OP_FRONT_LEN	4096
 #define OSD_OPREPLY_FRONT_LEN	512
 
-const static struct ceph_connection_operations osd_con_ops;
+static const struct ceph_connection_operations osd_con_ops;
 static int __kick_requests(struct ceph_osd_client *osdc,
 			  struct ceph_osd *kickosd);
 
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		req = kzalloc(sizeof(*req), GFP_NOFS);
 	}
 	if (req == NULL)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-				   OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
-	if (IS_ERR(msg)) {
+				   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
+	if (!msg) {
 		ceph_osdc_put_request(req);
-		return ERR_PTR(PTR_ERR(msg));
+		return NULL;
 	}
 	req->r_reply = msg;
 
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
-	if (IS_ERR(msg)) {
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
+	if (!msg) {
 		ceph_osdc_put_request(req);
-		return ERR_PTR(PTR_ERR(msg));
+		return NULL;
 	}
 	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
 	 * should mark the osd as failed and we should find out about
 	 * it from an updated osd map.
 	 */
-	while (!list_empty(&osdc->req_lru)) {
+	while (timeout && !list_empty(&osdc->req_lru)) {
 		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
 				 r_req_lru_item);
 
@@ -1078,6 +1078,7 @@ done:
 	if (newmap)
 		kick_requests(osdc, NULL);
 	up_read(&osdc->map_sem);
+	wake_up(&osdc->client->auth_wq);
 	return;
 
 bad:
@@ -1087,45 +1088,6 @@ bad:
 	return;
 }
 
-
-/*
- * A read request prepares specific pages that data is to be read into.
- * When a message is being read off the wire, we call prepare_pages to
- * find those pages.
- *  0 = success, -1 failure.
- */
-static int __prepare_pages(struct ceph_connection *con,
-			 struct ceph_msg_header *hdr,
-			 struct ceph_osd_request *req,
-			 u64 tid,
-			 struct ceph_msg *m)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-	int ret = -1;
-	int data_len = le32_to_cpu(hdr->data_len);
-	unsigned data_off = le16_to_cpu(hdr->data_off);
-
-	int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
-	if (!osd)
-		return -1;
-
-	osdc = osd->o_osdc;
-
-	dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
-	     tid, req->r_num_pages, want);
-	if (unlikely(req->r_num_pages < want))
-		goto out;
-	m->pages = req->r_pages;
-	m->nr_pages = req->r_num_pages;
-	ret = 0; /* success */
-out:
-	BUG_ON(ret < 0 || m->nr_pages < want);
-
-	return ret;
-}
-
 /*
  * Register request, send initial attempt.
  */
@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->req_mempool)
 		goto out;
 
-	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
+	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+				"osd_op");
 	if (err < 0)
 		goto out_mempool;
 	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-				OSD_OPREPLY_FRONT_LEN, 10, true);
+				OSD_OPREPLY_FRONT_LEN, 10, true,
+				"osd_op_reply");
 	if (err < 0)
 		goto out_msgpool;
 	return 0;
@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0, truncate_seq, truncate_size, NULL,
 				    false, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (!req)
+		return -ENOMEM;
 
 	/* it may be a short read due to an object boundary */
 	req->r_pages = pages;
@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    snapc, do_sync,
 				    truncate_seq, truncate_size, mtime,
 				    nofail, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (!req)
+		return -ENOMEM;
 
 	/* it may be a short write due to an object boundary */
 	req->r_pages = pages;
@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 }
 
 /*
- * lookup and return message for incoming reply
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
  */
 static struct ceph_msg *get_reply(struct ceph_connection *con,
 				  struct ceph_msg_header *hdr,
@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	int front = le32_to_cpu(hdr->front_len);
 	int data_len = le32_to_cpu(hdr->data_len);
 	u64 tid;
-	int err;
 
 	tid = le64_to_cpu(hdr->tid);
 	mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		     req->r_reply, req->r_con_filling_msg);
 		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
 		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg = NULL;
 	}
 
 	if (front > req->r_reply->front.iov_len) {
 		pr_warning("get_reply front %d > preallocated %d\n",
 			   front, (int)req->r_reply->front.iov_len);
-		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
-		if (IS_ERR(m))
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+		if (!m)
 			goto out;
 		ceph_msg_put(req->r_reply);
 		req->r_reply = m;
@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	m = ceph_msg_get(req->r_reply);
 
 	if (data_len > 0) {
-		err = __prepare_pages(con, hdr, req, tid, m);
-		if (err < 0) {
+		unsigned data_off = le16_to_cpu(hdr->data_off);
+		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+		if (unlikely(req->r_num_pages < want)) {
+			pr_warning("tid %llu reply %d > expected %d pages\n",
+				   tid, want, req->r_num_pages);
 			*skip = 1;
 			ceph_msg_put(m);
-			m = ERR_PTR(err);
+			m = NULL;
+			goto out;
 		}
+		m->pages = req->r_pages;
+		m->nr_pages = req->r_num_pages;
 	}
 	*skip = 0;
 	req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
-		return ceph_msg_new(type, front, 0, 0, NULL);
+		return ceph_msg_new(type, front, GFP_NOFS);
 	case CEPH_MSG_OSD_OPREPLY:
 		return get_reply(con, hdr, skip);
 	default:
@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	return ceph_monc_validate_auth(&osdc->client->monc);
 }
 
-const static struct ceph_connection_operations osd_con_ops = {
+static const struct ceph_connection_operations osd_con_ops = {
 	.get = get_osd_con,
 	.put = put_osd_con,
 	.dispatch = dispatch,
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745..b6859f47d36 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
 
 static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
 {
-	struct page *page = alloc_page(GFP_NOFS);
+	struct page *page = __page_cache_alloc(GFP_NOFS);
 	if (!page)
 		return -ENOMEM;
 	pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871..8fcc023056c 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
 	__le64 snap_seq;          /* seq for per-pool snapshot */
 	__le32 snap_epoch;        /* epoch of last snap */
 	__le32 num_snaps;
-	__le32 num_removed_snap_intervals;
-	__le64 uid;
+	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+	__le64 auid;               /* who owns the pg */
 } __attribute__ ((packed));
 
 /*
@@ -208,6 +208,7 @@ enum {
 	/* read */
 	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
 	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
 
 	/* write */
 	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
 #define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
 #define EBLACKLISTED ESHUTDOWN /* blacklisted */
 
+/* xattr comparison */
+enum {
+	CEPH_OSD_CMPXATTR_OP_NOP = 0,
+	CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+	CEPH_OSD_CMPXATTR_OP_NE  = 2,
+	CEPH_OSD_CMPXATTR_OP_GT  = 3,
+	CEPH_OSD_CMPXATTR_OP_GTE = 4,
+	CEPH_OSD_CMPXATTR_OP_LT  = 5,
+	CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+	CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+	CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
+
 /*
  * an individual object operation.  each may be accompanied by some data
  * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
 		struct {
 			__le32 name_len;
 			__le32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
 		} __attribute__ ((packed)) xattr;
 		struct {
 			__u8 class_len;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db7045..c0b26b6badb 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 			    struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9307bbee6fb..7c663d9b9f8 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/parser.h>
-#include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
-#include <linux/version.h>
-#include <linux/vmalloc.h>
 
 #include "decode.h"
 #include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int ceph_syncfs(struct super_block *sb, int wait)
 {
 	dout("sync_fs %d\n", wait);
-	ceph_osdc_sync(&ceph_client(sb)->osdc);
-	ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
+	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
 	dout("sync_fs %d done\n", wait);
 	return 0;
 }
 
+static int default_congestion_kb(void)
+{
+	int congestion_kb;
+
+	/*
+	 * Copied from NFS
+	 *
+	 * congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (congestion_kb > 256*1024)
+		congestion_kb = 256*1024;
+
+	return congestion_kb;
+}
 
 /**
  * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",nocrc");
 	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
 		seq_puts(m, ",noasyncreaddir");
+
+	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
+	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
+	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
+	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, ",osdkeepalivetimeout=%d",
+			 args->osd_keepalive_timeout);
+	if (args->wsize)
+		seq_printf(m, ",wsize=%d", args->wsize);
+	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+		seq_printf(m, ",rsize=%d", args->rsize);
+	if (args->congestion_kb != default_congestion_kb())
+		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
+	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_min=%d",
+			 args->caps_wanted_delay_min);
+	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_max=%d",
+			   args->caps_wanted_delay_max);
+	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+		seq_printf(m, ",cap_release_safety=%d",
+			   args->cap_release_safety);
+	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
 	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
 	if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
 	inode_init_once(&ci->vfs_inode);
 }
 
-static int default_congestion_kb(void)
-{
-	int congestion_kb;
-
-	/*
-	 * Copied from NFS
-	 *
-	 * congestion size, scale with available memory.
-	 *
-	 *  64MB:    8192k
-	 * 128MB:   11585k
-	 * 256MB:   16384k
-	 * 512MB:   23170k
-	 *   1GB:   32768k
-	 *   2GB:   46340k
-	 *   4GB:   65536k
-	 *   8GB:   92681k
-	 *  16GB:  131072k
-	 *
-	 * This allows larger machines to have larger/more transfers.
-	 * Limit the default to 256M
-	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-	if (congestion_kb > 256*1024)
-		congestion_kb = 256*1024;
-
-	return congestion_kb;
-}
-
 static int __init init_caches(void)
 {
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
 	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
+	Opt_cap_release_safety,
 	Opt_readdir_max_entries,
+	Opt_readdir_max_bytes,
 	Opt_congestion_kb,
 	Opt_last_int,
 	/* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
 	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+	{Opt_cap_release_safety, "cap_release_safety=%d"},
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
 	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-	args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
-	args->max_readdir = 1024;
+	args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+	args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+	args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 	args->congestion_kb = default_congestion_kb();
 
 	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 		case Opt_readdir_max_entries:
 			args->max_readdir = intval;
 			break;
+		case Opt_readdir_max_bytes:
+			args->max_readdir_bytes = intval;
+			break;
 		case Opt_congestion_kb:
 			args->congestion_kb = intval;
 			break;
@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
-static int have_mon_map(struct ceph_client *client)
+static int have_mon_and_osd_map(struct ceph_client *client)
 {
-	return client->monc.monmap && client->monc.monmap->epoch;
+	return client->monc.monmap && client->monc.monmap->epoch &&
+	       client->osdc.osdmap && client->osdc.osdmap->epoch;
 }
 
 /*
@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 	if (err < 0)
 		goto out;
 
-	while (!have_mon_map(client)) {
+	while (!have_mon_and_osd_map(client)) {
 		err = -EIO;
 		if (timeout && time_after_eq(jiffies, started + timeout))
 			goto out;
@@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 		/* wait */
 		dout("mount waiting for mon_map\n");
 		err = wait_event_interruptible_timeout(client->auth_wq,
-			       have_mon_map(client) || (client->auth_err < 0),
-			       timeout);
+		       have_mon_and_osd_map(client) || (client->auth_err < 0),
+		       timeout);
 		if (err == -EINTR || err == -ERESTARTSYS)
 			goto out;
 		if (client->auth_err < 0) {
@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 /*
  * construct our own bdi so we can control readahead, etc.
  */
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); /* bdi name suffix */
+
 static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
 	int err;
@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 		client->backing_dev_info.ra_pages =
 			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
-	err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+	err = bdi_register(&client->backing_dev_info, NULL, "ceph-%ld",
+			   atomic_long_inc_return(&bdi_seq)); /* returns long */
 	if (!err)
 		sb->s_bdi = &client->backing_dev_info;
 	return err;
@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 		goto out;
 	}
 
-	if (ceph_client(sb) != client) {
+	if (ceph_sb_to_client(sb) != client) {
 		ceph_destroy_client(client);
-		client = ceph_client(sb);
+		client = ceph_sb_to_client(sb);
 		dout("get_sb got existing client %p\n", client);
 	} else {
 		dout("get_sb using new client %p\n", client);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87..3725c9ee9d0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -52,24 +52,25 @@
 
 struct ceph_mount_args {
 	int sb_flags;
+	int flags;
+	struct ceph_fsid fsid;
+	struct ceph_entity_addr my_addr;
 	int num_mon;
 	struct ceph_entity_addr *mon_addr;
-	int flags;
 	int mount_timeout;
 	int osd_idle_ttl;
-	int caps_wanted_delay_min, caps_wanted_delay_max;
-	struct ceph_fsid fsid;
-	struct ceph_entity_addr my_addr;
-	int wsize;
-	int rsize;            /* max readahead */
-	int max_readdir;      /* max readdir size */
-	int congestion_kb;      /* max readdir size */
 	int osd_timeout;
 	int osd_keepalive_timeout;
+	int wsize;
+	int rsize;            /* max readahead */
+	int congestion_kb;    /* max writeback in flight */
+	int caps_wanted_delay_min, caps_wanted_delay_max;
+	int cap_release_safety;
+	int max_readdir;       /* max readdir result (entries) */
+	int max_readdir_bytes; /* max readdir result (bytes) */
 	char *snapdir_name;   /* default ".snap" */
 	char *name;
 	char *secret;
-	int cap_release_safety;
 };
 
 /*
@@ -80,13 +81,14 @@ struct ceph_mount_args {
 #define CEPH_OSD_KEEPALIVE_DEFAULT  5
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 #define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+#define CEPH_MAX_READDIR_DEFAULT    1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
 
 #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
 #define CEPH_AUTH_NAME_DEFAULT   "guest"
-
 /*
  * Delay telling the MDS we no longer want caps, in case we reopen
  * the file.  Delay a minimum amount of time, even if we send a cap
@@ -96,6 +98,7 @@ struct ceph_mount_args {
 #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
 #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
 
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
 
 /* mount state */
 enum {
@@ -160,12 +163,6 @@ struct ceph_client {
 #endif
 };
 
-static inline struct ceph_client *ceph_client(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-
 /*
  * File i/o capability.  This tracks shared state with the metadata
  * server that allows us to cache or writeback attributes or to read
@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 extern void ceph_dentry_lru_add(struct dentry *dn);
 extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
 
 /*
  * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907f..68aeebc6968 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
 
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_SECURITY_PREFIX,
+	return !strncmp(name, "ceph.", 5) ||
+	       !strncmp(name, XATTR_SECURITY_PREFIX,
 			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-	{ true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
-	{ true, "user.ceph.dir.files", ceph_vxattrcb_files},
-	{ true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-	{ true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
-	{ true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-	{ true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-	{ true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-	{ true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+	{ true, "ceph.dir.entries", ceph_vxattrcb_entries},
+	{ true, "ceph.dir.files", ceph_vxattrcb_files},
+	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
+	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
 	{ true, NULL, NULL }
 };
 
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-	{ true, "user.ceph.layout", ceph_vxattrcb_layout},
+	{ true, "ceph.layout", ceph_vxattrcb_layout},
 	{ NULL, NULL }
 };
 
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		ci->i_xattrs.names_size -= xattr->name_len;
 		ci->i_xattrs.vals_size -= xattr->val_len;
 	}
-	if (!xattr) {
-		pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
-		       &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
-		       xattr->val);
-		return -ENOMEM;
-	}
 	ci->i_xattrs.names_size += name_len;
 	ci->i_xattrs.vals_size += val_len;
 	if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 	     ci->i_xattrs.version, ci->i_xattrs.index_version);
 
 	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
-	    (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
+	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
 		goto list_xattr;
 	} else {
 		spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 			      const char *value, size_t size, int flags)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 			return -ENOMEM;
 		err = -ENOMEM;
 		for (i = 0; i < nr_pages; i++) {
-			pages[i] = alloc_page(GFP_NOFS);
+			pages[i] = __page_cache_alloc(GFP_NOFS);
 			if (!pages[i]) {
 				nr_pages = i;
 				goto out;
@@ -779,7 +774,7 @@ out:
 
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct ceph_mds_client *mdsc = &client->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa0..ca25d96d45c 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
 /*
  * Pioctl operations for Coda.
- * Original version: (C) 1996 Peter Braam 
+ * Original version: (C) 1996 Peter Braam
  * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
  *
  * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
 
+#include <linux/smp_lock.h>
+
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
-static int coda_pioctl(struct inode * inode, struct file * filp, 
-                       unsigned int cmd, unsigned long user_data);
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data);
 
 /* exported from this file */
-const struct inode_operations coda_ioctl_inode_operations =
-{
+const struct inode_operations coda_ioctl_inode_operations = {
 	.permission	= coda_ioctl_permission,
 	.setattr	= coda_setattr,
 };
 
 const struct file_operations coda_ioctl_operations = {
 	.owner		= THIS_MODULE,
-	.ioctl		= coda_pioctl,
+	.unlocked_ioctl	= coda_pioctl,
 };
 
 /* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
-static int coda_pioctl(struct inode * inode, struct file * filp, 
-                       unsigned int cmd, unsigned long user_data)
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data)
 {
 	struct path path;
-        int error;
+	int error;
 	struct PioctlData data;
-        struct inode *target_inode = NULL;
-        struct coda_inode_info *cnp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *target_inode = NULL;
+	struct coda_inode_info *cnp;
 
-        /* get the Pioctl data arguments from user space */
-        if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-	    return -EINVAL;
-	}
-       
-        /* 
-         * Look up the pathname. Note that the pathname is in 
-         * user memory, and namei takes care of this
-         */
-        if (data.follow) {
-                error = user_path(data.path, &path);
-	} else {
-	        error = user_lpath(data.path, &path);
+	lock_kernel();
+
+	/* get the Pioctl data arguments from user space */
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
+		error = -EINVAL;
+		goto out;
 	}
-		
-	if ( error ) {
-		return error;
-        } else {
+
+	/*
+	 * Look up the pathname. Note that the pathname is in
+	 * user memory, and namei takes care of this
+	 */
+	if (data.follow)
+		error = user_path(data.path, &path);
+	else
+		error = user_lpath(data.path, &path);
+
+	if (error)
+		goto out;
+	else
 		target_inode = path.dentry->d_inode;
-	}
-	
+
 	/* return if it is not a Coda inode */
-	if ( target_inode->i_sb != inode->i_sb ) {
+	if (target_inode->i_sb != inode->i_sb) {
 		path_put(&path);
-	        return  -EINVAL;
+		error = -EINVAL;
+		goto out;
 	}
 
 	/* now proceed to make the upcall */
-        cnp = ITOC(target_inode);
+	cnp = ITOC(target_inode);
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
 
 	path_put(&path);
-        return error;
-}
 
+out:
+	unlock_kernel();
+	return error;
+}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca209..66b9cf79c5b 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 
-			    unsigned int cmd, unsigned long arg)
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int data;
 
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
 	.read		= coda_psdev_read,
 	.write		= coda_psdev_write,
 	.poll		= coda_psdev_poll,
-	.ioctl		= coda_psdev_ioctl,
+	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
 };
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f84..f0b391c5055 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
 	return ret;
 }
 
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+
+/* Copy in and validate a compat iovec array into *ret_pointer (fast_pointer,
+ * or kmalloc'd when nr_segs > fast_segs); returns total length or -errno. */
+ssize_t compat_rw_copy_check_uvector(int type,
+		const struct compat_iovec __user *uvector, unsigned long nr_segs,
+		unsigned long fast_segs, struct iovec *fast_pointer,
+		struct iovec **ret_pointer)
+{
+	compat_ssize_t tot_len;
+	struct iovec *iov = *ret_pointer = fast_pointer;
+	ssize_t ret = 0;
+	int seg;
+
+	/*
+	 * SuS says "The readv() function *may* fail if the iovcnt argument
+	 * was less than or equal to 0, or greater than {IOV_MAX}."  Linux has
+	 * traditionally returned zero for zero segments, so...
+	 */
+	if (nr_segs == 0)
+		goto out;
+
+	ret = -EINVAL;
+	if (nr_segs > UIO_MAXIOV)	/* unsigned, so no "< 0" case exists */
+		goto out;
+	if (nr_segs > fast_segs) {
+		ret = -ENOMEM;
+		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		if (iov == NULL)	/* *ret_pointer is still fast_pointer */
+			goto out;
+	}
+	*ret_pointer = iov;
+
+	/*
+	 * Single unix specification:
+	 * We should -EINVAL if an element length is not >= 0 and fitting an
+	 * ssize_t.  The total length is fitting an ssize_t
+	 *
+	 * Be careful here because iov_len is a size_t not an ssize_t
+	 */
+	tot_len = 0;
+	ret = -EINVAL;
+	for (seg = 0; seg < nr_segs; seg++) {
+		compat_ssize_t tmp = tot_len;
+		compat_uptr_t buf;
+		compat_ssize_t len;
+
+		if (__get_user(len, &uvector->iov_len) ||
+		   __get_user(buf, &uvector->iov_base)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
+			goto out;
+		tot_len += len;
+		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
+			goto out;
+		/* check the converted pointer, i.e. the one stored below */
+		if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		iov->iov_base = compat_ptr(buf);
+		iov->iov_len = (compat_size_t) len;
+		uvector++;
+		iov++;
+	}
+	ret = tot_len;
+out:
+	return ret;
+}
+
 static inline long
 copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
 {
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
 	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
 	ret = copy_iocb(nr, iocb, iocb64);
 	if (!ret)
-		ret = sys_io_submit(ctx_id, nr, iocb64);
+		ret = do_io_submit(ctx_id, nr, iocb64, 1);
 	return ret;
 }
 
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 {
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov=iovstack, *vector;
+	struct iovec *iov = iovstack;	/* valid before any early "goto out" */
 	ssize_t ret;
-	int seg;
 	io_fn_t fn;
 	iov_fn_t fnv;
 
-	/*
-	 * SuS says "The readv() function *may* fail if the iovcnt argument
-	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-	 * traditionally returned zero for zero segments, so...
-	 */
-	ret = 0;
-	if (nr_segs == 0)
-		goto out;
-
-	/*
-	 * First get the "struct iovec" from user memory and
-	 * verify all the pointers
-	 */
 	ret = -EINVAL;
-	if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
-		goto out;
 	if (!file->f_op)
 		goto out;
-	if (nr_segs > UIO_FASTIOV) {
-		ret = -ENOMEM;
-		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-		if (!iov)
-			goto out;
-	}
+
 	ret = -EFAULT;
 	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 		goto out;
 
-	/*
-	 * Single unix specification:
-	 * We should -EINVAL if an element length is not >= 0 and fitting an
-	 * ssize_t.  The total length is fitting an ssize_t
-	 *
-	 * Be careful here because iov_len is a size_t not an ssize_t
-	 */
-	tot_len = 0;
-	vector = iov;
-	ret = -EINVAL;
-	for (seg = 0 ; seg < nr_segs; seg++) {
-		compat_ssize_t tmp = tot_len;
-		compat_ssize_t len;
-		compat_uptr_t buf;
-
-		if (__get_user(len, &uvector->iov_len) ||
-		    __get_user(buf, &uvector->iov_base)) {
-			ret = -EFAULT;
-			goto out;
-		}
-		if (len < 0)	/* size_t not fitting an compat_ssize_t .. */
-			goto out;
-		tot_len += len;
-		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-			goto out;
-		vector->iov_base = compat_ptr(buf);
-		vector->iov_len = (compat_size_t) len;
-		uvector++;
-		vector++;
-	}
+	tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+					       UIO_FASTIOV, iovstack, &iov);
 	if (tot_len == 0) {
 		ret = 0;
 		goto out;
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2..e19de6a8033 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 * use STACK_TOP because that can depend on attributes which aren't
 	 * configured yet.
 	 */
+	BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS;
+	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else if (executable_stack == EXSTACK_DISABLE_X)
 		vm_flags &= ~VM_EXEC;
 	vm_flags |= mm->def_flags;
+	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 
 	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 			vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
 			goto out_unlock;
 	}
 
+	/* mprotect_fixup is overkill to remove the temporary stack flags */
+	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
 	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
 	stack_size = vma->vm_end - vma->vm_start;
 	/*
@@ -763,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
 	struct signal_struct *sig = tsk->signal;
 	struct sighand_struct *oldsighand = tsk->sighand;
 	spinlock_t *lock = &oldsighand->siglock;
-	int count;
 
 	if (thread_group_empty(tsk))
 		goto no_thread_group;
@@ -780,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
 		spin_unlock_irq(lock);
 		return -EAGAIN;
 	}
+
 	sig->group_exit_task = tsk;
-	zap_other_threads(tsk);
+	sig->notify_count = zap_other_threads(tsk);
+	if (!thread_group_leader(tsk))
+		sig->notify_count--;
 
-	/* Account for the thread group leader hanging around: */
-	count = thread_group_leader(tsk) ? 1 : 2;
-	sig->notify_count = count;
-	while (atomic_read(&sig->count) > count) {
+	while (sig->notify_count) {
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(lock);
 		schedule();
@@ -1657,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 	struct completion *vfork_done;
-	int core_waiters;
+	int core_waiters = -EBUSY;
 
 	init_completion(&core_state->startup);
 	core_state->dumper.task = tsk;
 	core_state->dumper.next = NULL;
-	core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+
+	down_write(&mm->mmap_sem);
+	if (!mm->core_state)
+		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
 	up_write(&mm->mmap_sem);
 
 	if (unlikely(core_waiters < 0))
@@ -1782,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
 }
 
 
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace.  Specifically
+ * it sets up a pipe, installs the read end as fd 0 (stdin)
+ * for the process and stores the write end in cp->file.
+ * Returns 0 on success, or PTR_ERR() of the failed pipe end.
+ * Note that it also sets the core limit to 1.  This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info)
+{
+	struct file *rp, *wp;
+	struct fdtable *fdt;
+	struct coredump_params *cp = (struct coredump_params *)info->data;
+	struct files_struct *cf = current->files;
+
+	wp = create_write_pipe(0);
+	if (IS_ERR(wp))
+		return PTR_ERR(wp);
+
+	rp = create_read_pipe(wp, 0);
+	if (IS_ERR(rp)) {
+		free_write_pipe(wp);
+		return PTR_ERR(rp);
+	}
+
+	cp->file = wp;
+
+	sys_close(0);
+	fd_install(0, rp);
+	spin_lock(&cf->file_lock);
+	fdt = files_fdtable(cf);
+	FD_SET(0, fdt->open_fds);
+	FD_CLR(0, fdt->close_on_exec);
+	spin_unlock(&cf->file_lock);
+
+	/* and disallow core files too */
+	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+	return 0;
+}
+
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
 	struct core_state core_state;
 	char corename[CORENAME_MAX_SIZE + 1];
 	struct mm_struct *mm = current->mm;
 	struct linux_binfmt * binfmt;
-	struct inode * inode;
 	const struct cred *old_cred;
 	struct cred *cred;
 	int retval = 0;
 	int flag = 0;
-	int ispipe = 0;
-	char **helper_argv = NULL;
-	int helper_argc = 0;
-	int dump_count = 0;
+	int ispipe;
 	static atomic_t core_dump_count = ATOMIC_INIT(0);
 	struct coredump_params cprm = {
 		.signr = signr,
@@ -1815,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	binfmt = mm->binfmt;
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
-
-	cred = prepare_creds();
-	if (!cred) {
-		retval = -ENOMEM;
+	if (!__get_dumpable(cprm.mm_flags))
 		goto fail;
-	}
 
-	down_write(&mm->mmap_sem);
-	/*
-	 * If another thread got here first, or we are not dumpable, bail out.
-	 */
-	if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
-		up_write(&mm->mmap_sem);
-		put_cred(cred);
+	cred = prepare_creds();
+	if (!cred)
 		goto fail;
-	}
-
 	/*
 	 *	We cannot trust fsuid as being the "true" uid of the
 	 *	process nor do we know its entire history. We only know it
@@ -1844,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	}
 
 	retval = coredump_wait(exit_code, &core_state);
-	if (retval < 0) {
-		put_cred(cred);
-		goto fail;
-	}
+	if (retval < 0)
+		goto fail_creds;
 
 	old_cred = override_creds(cred);
 
@@ -1865,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	ispipe = format_corename(corename, signr);
 	unlock_kernel();
 
-	if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
-		goto fail_unlock;
-
  	if (ispipe) {
-		if (cprm.limit == 0) {
+		int dump_count;
+		char **helper_argv;
+
+		if (cprm.limit == 1) {
 			/*
 			 * Normally core limits are irrelevant to pipes, since
 			 * we're not writing to the file system, but we use
-			 * cprm.limit of 0 here as a speacial value. Any
-			 * non-zero limit gets set to RLIM_INFINITY below, but
+			 * cprm.limit of 1 here as a speacial value. Any
+			 * non-1 limit gets set to RLIM_INFINITY below, but
-			 * a limit of 0 skips the dump.  This is a consistent
+			 * a limit of 1 skips the dump.  This is a consistent
 			 * way to catch recursive crashes.  We can still crash
-			 * if the core_pattern binary sets RLIM_CORE =  !0
+			 * if the core_pattern binary sets RLIM_CORE =  !1
 			 * but it runs as root, and can do lots of stupid things
 			 * Note that we use task_tgid_vnr here to grab the pid
 			 * of the process group leader.  That way we get the
@@ -1885,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 			 * core_pattern process dies.
 			 */
 			printk(KERN_WARNING
-				"Process %d(%s) has RLIMIT_CORE set to 0\n",
+				"Process %d(%s) has RLIMIT_CORE set to 1\n",
 				task_tgid_vnr(current), current->comm);
 			printk(KERN_WARNING "Aborting core\n");
 			goto fail_unlock;
 		}
+		cprm.limit = RLIM_INFINITY;
 
 		dump_count = atomic_inc_return(&core_dump_count);
 		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1899,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+		helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
 			goto fail_dropcount;
 		}
 
-		cprm.limit = RLIM_INFINITY;
-
-		/* SIGPIPE can happen, but it's just never processed */
-		if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
-				&cprm.file)) {
+		retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+					NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+					NULL, &cprm);
+		argv_free(helper_argv);
+		if (retval) {
  			printk(KERN_INFO "Core dump to %s pipe failed\n",
 			       corename);
-			goto fail_dropcount;
+			goto close_fail;
  		}
- 	} else
+	} else {
+		struct inode *inode;
+
+		if (cprm.limit < binfmt->min_coredump)
+			goto fail_unlock;
+
 		cprm.file = filp_open(corename,
 				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
 				 0600);
-	if (IS_ERR(cprm.file))
-		goto fail_dropcount;
-	inode = cprm.file->f_path.dentry->d_inode;
-	if (inode->i_nlink > 1)
-		goto close_fail;	/* multiple links - don't dump */
-	if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
-		goto close_fail;
-
-	/* AK: actually i see no reason to not allow this for named pipes etc.,
-	   but keep the previous behaviour for now. */
-	if (!ispipe && !S_ISREG(inode->i_mode))
-		goto close_fail;
-	/*
-	 * Dont allow local users get cute and trick others to coredump
-	 * into their pre-created files:
-	 * Note, this is not relevant for pipes
-	 */
-	if (!ispipe && (inode->i_uid != current_fsuid()))
-		goto close_fail;
-	if (!cprm.file->f_op)
-		goto close_fail;
-	if (!cprm.file->f_op->write)
-		goto close_fail;
-	if (!ispipe &&
-	    do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
-		goto close_fail;
+		if (IS_ERR(cprm.file))
+			goto fail_unlock;
 
-	retval = binfmt->core_dump(&cprm);
+		inode = cprm.file->f_path.dentry->d_inode;
+		if (inode->i_nlink > 1)
+			goto close_fail;
+		if (d_unhashed(cprm.file->f_path.dentry))
+			goto close_fail;
+		/*
+		 * AK: actually i see no reason to not allow this for named
+		 * pipes etc, but keep the previous behaviour for now.
+		 */
+		if (!S_ISREG(inode->i_mode))
+			goto close_fail;
+		/*
+		 * Dont allow local users get cute and trick others to coredump
+		 * into their pre-created files.
+		 */
+		if (inode->i_uid != current_fsuid())
+			goto close_fail;
+		if (!cprm.file->f_op || !cprm.file->f_op->write)
+			goto close_fail;
+		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+			goto close_fail;
+	}
 
+	retval = binfmt->core_dump(&cprm);
 	if (retval)
 		current->signal->group_exit_code |= 0x80;
-close_fail:
+
 	if (ispipe && core_pipe_limit)
 		wait_for_dump_helpers(cprm.file);
-	filp_close(cprm.file, NULL);
+close_fail:
+	if (cprm.file)
+		filp_close(cprm.file, NULL);
 fail_dropcount:
-	if (dump_count)
+	if (ispipe)
 		atomic_dec(&core_dump_count);
 fail_unlock:
-	if (helper_argv)
-		argv_free(helper_argv);
-
+	coredump_finish(mm);
 	revert_creds(old_cred);
+fail_creds:
 	put_cred(cred);
-	coredump_finish(mm);
 fail:
 	return;
 }
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c..d91e9d829bc 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
 	de->inode_no = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
 	exofs_set_de_type(de, inode);
-	kunmap_atomic(page, KM_USER0);
+	kunmap_atomic(kaddr, KM_USER0);
 	err = exofs_commit_chunk(page, 0, chunk_size);
 fail:
 	page_cache_release(page);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d7c6afa7975..4bb6ef822e4 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
+static int exofs_releasepage(struct page *page, gfp_t gfp) /* gfp is unused */
+{
+	EXOFS_DBGMSG("page 0x%lx\n", page->index);
+	WARN_ON(1);	/* warns on every call; presumably an unexpected path */
+	return try_to_free_buffers(page);
+}
+
+static void exofs_invalidatepage(struct page *page, unsigned long offset)
+{
+	EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
+	WARN_ON(1);	/* warns on every call; presumably an unexpected path */
+
+	block_invalidatepage(page, offset);	/* then defer to the generic helper */
+}
+
 const struct address_space_operations exofs_aops = {
 	.readpage	= exofs_readpage,
 	.readpages	= exofs_readpages,
@@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = {
 	.writepages	= exofs_writepages,
 	.write_begin	= exofs_write_begin_export,
 	.write_end	= exofs_write_end,
+	.releasepage	= exofs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.invalidatepage = exofs_invalidatepage,
+
+	/* Not implemented Yet */
+	.bmap		= NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
+	.direct_IO	= NULL, /* TODO: Should be trivial to do */
+
+	/* With these NULL has special meaning or default is not exported */
+	.sync_page	= NULL,
+	.get_xip_mem	= NULL,
+	.migratepage	= NULL,
+	.launder_page	= NULL,
+	.is_partially_uptodate = NULL,
+	.error_remove_page = NULL,
 };
 
 /******************************************************************************
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c..95b7594c76f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
 		*count = ar.len;
-
 	/*
-	 * Account for the allocated meta blocks
+	 * Account for the allocated meta blocks.  We will never
+	 * fail EDQUOT for metadata, but we do account for it.
 	 */
 	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		dquot_alloc_block_nofail(inode, ar.len);
 	}
 	return ret;
 }
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c4865508..5b6973fbf1b 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
 		else if (start_blk >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			if (start_blk + count > (entry->start_blk + 
+			if (start_blk + count > (entry->start_blk +
 						 entry->count))
-				entry->count = (start_blk + count - 
+				entry->count = (start_blk + count -
 						entry->start_blk);
 			new_node = *n;
 			new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a04..ea5e6cb7e2a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		__ext4_error(dir->i_sb, function,
-			"bad entry in directory #%lu: %s - block=%llu"
+		ext4_error_inode(function, dir,
+			"bad entry in directory: %s - block=%llu"
 			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-			dir->i_ino, error_msg, 
-			(unsigned long long) bh->b_blocknr,     
+			error_msg, (unsigned long long) bh->b_blocknr,
 			(unsigned) (offset%bh->b_size), offset,
 			le32_to_cpu(de->inode),
 			rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
 
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				    EXT4_FEATURE_COMPAT_DIR_INDEX) &&
-	    ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
 	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
 	}
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
 	while (!error && !stored && filp->f_pos < inode->i_size) {
-		ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
-		struct buffer_head map_bh;
+		struct ext4_map_blocks map;
 		struct buffer_head *bh = NULL;
 
-		map_bh.b_state = 0;
-		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
+		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_len = 1;
+		err = ext4_map_blocks(NULL, inode, &map, 0);
 		if (err > 0) {
-			pgoff_t index = map_bh.b_blocknr >>
+			pgoff_t index = map.m_pblk >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
 			if (!ra_has_index(&filp->f_ra, index))
 				page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
 					&filp->f_ra, filp,
 					index, 1);
 			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-			bh = ext4_bread(NULL, inode, blk, 0, &err);
+			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
 		}
 
 		/*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				ext4_error(sb, "directory #%lu "
+				EXT4_ERROR_INODE(inode, "directory "
 					   "contains a hole at offset %Lu",
-					   inode->i_ino,
 					   (unsigned long long) filp->f_pos);
 				dir_has_error = 1;
 			}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f..60bd31026e7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
 
 /*
  * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
-	ext4_error_inode(__func__, (inode), (fmt), ## a);
+	ext4_error_inode(__func__, (inode), (fmt), ## a)
 
 #define EXT4_ERROR_FILE(file, fmt, a...)	\
-	ext4_error_file(__func__, (file), (fmt), ## a);
+	ext4_error_file(__func__, (file), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
 typedef unsigned int ext4_group_t;
 
 /*
- * Flags used in mballoc's allocation_context flags field.  
+ * Flags used in mballoc's allocation_context flags field.
  *
  * Also used to show what's going on for debugging purposes when the
  * flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
 };
 
 /*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks().  It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW		(1 << BH_New)
+#define EXT4_MAP_MAPPED		(1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
+#define EXT4_MAP_UNINIT		(1 << BH_Uninit)
+#define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+				 EXT4_MAP_UNINIT)
+
+struct ext4_map_blocks {
+	ext4_fsblk_t m_pblk;	/* physical block; read by callers on success */
+	ext4_lblk_t m_lblk;	/* logical block to map; set by the caller */
+	unsigned int m_len;	/* number of blocks requested by the caller */
+	unsigned int m_flags;	/* EXT4_MAP_* status bits */
+};
+
+/*
  * For delayed allocation tracking
  */
 struct mpage_da_data {
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 		return flags & EXT4_OTHER_FLMASK;
 }
 
+/*
+ * Inode flag bit numbers (not masks), for the atomic test/set/clear helpers
+ */
+enum {
+	EXT4_INODE_SECRM	= 0,	/* Secure deletion */
+	EXT4_INODE_UNRM		= 1,	/* Undelete */
+	EXT4_INODE_COMPR	= 2,	/* Compress file */
+	EXT4_INODE_SYNC		= 3,	/* Synchronous updates */
+	EXT4_INODE_IMMUTABLE	= 4,	/* Immutable file */
+	EXT4_INODE_APPEND	= 5,	/* writes to file may only append */
+	EXT4_INODE_NODUMP	= 6,	/* do not dump file */
+	EXT4_INODE_NOATIME	= 7,	/* do not update atime */
+/* Reserved for compression usage... */
+	EXT4_INODE_DIRTY	= 8,
+	EXT4_INODE_COMPRBLK	= 9,	/* One or more compressed clusters */
+	EXT4_INODE_NOCOMPR	= 10,	/* Don't compress */
+	EXT4_INODE_ECOMPR	= 11,	/* Compression error */
+/* End compression flags --- maybe not all used */
+	EXT4_INODE_INDEX	= 12,	/* hash-indexed directory */
+	EXT4_INODE_IMAGIC	= 13,	/* AFS directory */
+	EXT4_INODE_JOURNAL_DATA	= 14,	/* file data should be journaled */
+	EXT4_INODE_NOTAIL	= 15,	/* file tail should not be merged */
+	EXT4_INODE_DIRSYNC	= 16,	/* dirsync behaviour (directories only) */
+	EXT4_INODE_TOPDIR	= 17,	/* Top of directory hierarchies */
+	EXT4_INODE_HUGE_FILE	= 18,	/* Set to each huge file */
+	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
+	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
+	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
+	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) do { if (!TEST_FLAG_VALUE(FLAG)) { \
+	printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+		EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } } while (0)
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * run-time test to make sure that each EXT4_XXX_FL mask is consistent
+ * with bit number EXT4_INODE_XXX.  If all is well the printk and BUG_ON
+ * will all drop out so it won't cost any extra space in the kernel image.
+ * It's important that these values agree, since we are using
+ * EXT4_INODE_XXX to test for the flag values, but EXT4_XXX_FL must be
+ * consistent with the values of FS_XXX_FL defined in include/linux/fs.h
+ * and the on-disk values found in ext2, ext3, and ext4 filesystems, and
+ * of course the values defined in e2fsprogs.
+ *
+ * TEST_FLAG_VALUE() shifts 1U, not 1: EXT4_INODE_RESERVED is bit 31 and
+ * 1 << 31 is undefined for a 32-bit signed int.  CHECK_FLAG_VALUE() is
+ * wrapped in do { } while (0) so it always acts as a single statement.
+ */
+static inline void ext4_check_flag_values(void)
+{
+	CHECK_FLAG_VALUE(SECRM);
+	CHECK_FLAG_VALUE(UNRM);
+	CHECK_FLAG_VALUE(COMPR);
+	CHECK_FLAG_VALUE(SYNC);
+	CHECK_FLAG_VALUE(IMMUTABLE);
+	CHECK_FLAG_VALUE(APPEND);
+	CHECK_FLAG_VALUE(NODUMP);
+	CHECK_FLAG_VALUE(NOATIME);
+	CHECK_FLAG_VALUE(DIRTY);
+	CHECK_FLAG_VALUE(COMPRBLK);
+	CHECK_FLAG_VALUE(NOCOMPR);
+	CHECK_FLAG_VALUE(ECOMPR);
+	CHECK_FLAG_VALUE(INDEX);
+	CHECK_FLAG_VALUE(IMAGIC);
+	CHECK_FLAG_VALUE(JOURNAL_DATA);
+	CHECK_FLAG_VALUE(NOTAIL);
+	CHECK_FLAG_VALUE(DIRSYNC);
+	CHECK_FLAG_VALUE(TOPDIR);
+	CHECK_FLAG_VALUE(HUGE_FILE);
+	CHECK_FLAG_VALUE(EXTENTS);
+	CHECK_FLAG_VALUE(EA_INODE);
+	CHECK_FLAG_VALUE(EOFBLOCKS);
+	CHECK_FLAG_VALUE(RESERVED);
+}
+
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
 	__u32 group;		/* Group number for this data */
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
 	__u16 unused;
 };
 
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {	/* used by EXT4_IOC32_GROUP_ADD */
+	u32 group;
+	compat_u64 block_bitmap;
+	compat_u64 inode_bitmap;
+	compat_u64 inode_table;
+	u32 blocks_count;
+	u16 reserved_blocks;
+	u16 unused;
+};
+#endif /* __KERNEL__ && CONFIG_COMPAT */
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
 	__u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
 	/* Caller is from the delayed allocation writeout path,
-	   so set the magic i_delalloc_reserve_flag after taking the 
+	   so set the magic i_delalloc_reserve_flag after taking the
 	   inode allocation semaphore for */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
  */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_GETRSVSZ		_IOR('f', 5, int)
 #define EXT4_IOC32_SETRSVSZ		_IOW('f', 6, int)
 #define EXT4_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD		_IOW('f', 8, struct compat_ext4_new_group_input)
 #ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
 #endif
 #define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
+#endif
 
 
 /*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
  */
 struct ext4_inode_info {
 	__le32	i_data[15];	/* unconverted */
-	__u32	i_flags;
-	ext4_fsblk_t	i_file_acl;
 	__u32	i_dtime;
+	ext4_fsblk_t	i_file_acl;
 
 	/*
 	 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
 	 */
 	ext4_group_t	i_block_group;
 	unsigned long	i_state_flags;		/* Dynamic state flags */
+	unsigned long	i_flags;
 
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
 	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+	EXT4_STATE_NEWENTRY,		/* File just added to dir */
 };
 
-static inline int ext4_test_inode_state(struct inode *inode, int bit)
-{
-	return test_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
-
-static inline void ext4_set_inode_state(struct inode *inode, int bit)
-{
-	set_bit(bit, &EXT4_I(inode)->i_state_flags);
+#define EXT4_INODE_BIT_FNS(name, field)					\
+static inline int ext4_test_inode_##name(struct inode *inode, int bit)	\
+{									\
+	return test_bit(bit, &EXT4_I(inode)->i_##field);		\
+}									\
+static inline void ext4_set_inode_##name(struct inode *inode, int bit)	\
+{									\
+	set_bit(bit, &EXT4_I(inode)->i_##field);			\
+}									\
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{									\
+	clear_bit(bit, &EXT4_I(inode)->i_##field);			\
 }
 
-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
-{
-	clear_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
+EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(state, state_flags)
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
 
 #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
 				      EXT4_FEATURE_COMPAT_DIR_INDEX) && \
-		      (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+		    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
 #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
 #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
 
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
 	ext4_grpblk_t	bb_first_free;	/* first free block */
 	ext4_grpblk_t	bb_free;	/* total free blocks */
 	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
+	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
 	struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
-extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t iblock, unsigned int max_blocks,
-			       struct buffer_head *bh_result, int flags);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			  ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+			   struct ext4_map_blocks *map, int flags);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
 			   struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad512646..dade0c02479 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
 		return 1;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
 		return 1;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 1;
 	return 0;
 }
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
 		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
 		return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
 		return 0;
 	if (EXT4_JOURNAL(inode) == NULL)
 		return 1;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
 		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return 0;
 	if (ext4_should_journal_data(inode))
 		return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca..377309c1af6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
 	if (err <= 0)
 		return err;
 	err = ext4_truncate_restart_trans(handle, inode, needed);
-	/*
-	 * We have dropped i_data_sem so someone might have cached again
-	 * an extent we are going to truncate.
-	 */
-	ext4_ext_invalidate_cache(inode);
+	if (err == 0)
+		err = -EAGAIN;
 
 	return err;
 }
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
 		/*
 		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
-		 * block groups per flexgroup, reserve the first block 
-		 * group for directories and special files.  Regular 
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
 		 * files will start at the second block group.  This
-		 * tends to speed up directory access and improves 
+		 * tends to speed up directory access and improves
 		 * fsck times.
 		 */
 		block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
 	return 0;
 
 corrupted:
-	__ext4_error(inode->i_sb, function,
-			"bad header/extent in inode #%lu: %s - magic %x, "
+	ext4_error_inode(function, inode,
+			"bad header/extent: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
-			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+			error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
 			max, le16_to_cpu(eh->eh_depth), depth);
 
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
 		merge_done = 1;
 		WARN_ON(eh->eh_entries == 0);
 		if (!eh->eh_entries)
-			ext4_error(inode->i_sb,
-				   "inode#%lu, eh->eh_entries = 0!",
-				   inode->i_ino);
+			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
 	}
 
 	return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	struct ext4_ext_cache *cex;
 	int ret = EXT4_EXT_CACHE_NO;
 
-	/* 
+	/*
 	 * We borrow i_block_reservation_lock to protect i_cached_extent
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path;
 	handle_t *handle;
-	int i = 0, err = 0;
+	int i, err;
 
 	ext_debug("truncate since %u\n", start);
 
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
+again:
 	ext4_ext_invalidate_cache(inode);
 
 	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
+	depth = ext_depth(inode);
 	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
 	if (path == NULL) {
 		ext4_journal_stop(handle);
 		return -ENOMEM;
 	}
+	path[0].p_depth = depth;
 	path[0].p_hdr = ext_inode_hdr(inode);
 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
 	}
-	path[0].p_depth = depth;
+	i = err = 0;
 
 	while (i >= 0 && err == 0) {
 		if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 out:
 	ext4_ext_drop_refs(path);
 	kfree(path);
+	if (err == -EAGAIN)
+		goto again;
 	ext4_journal_stop(handle);
 
 	return err;
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
-	int ret = -EIO;
+	int ret;
 	struct bio *bio;
 	int blkbits, blocksize;
 	sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 			len = ee_len;
 
 		bio = bio_alloc(GFP_NOIO, len);
+		if (!bio)
+			return -ENOMEM;
+
 		bio->bi_sector = ee_pblock;
 		bio->bi_bdev   = inode->i_sb->s_bdev;
 
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 		submit_bio(WRITE, bio);
 		wait_for_completion(&event);
 
-		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
-			ret = 0;
-		else {
-			ret = -EIO;
-			break;
+		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+			bio_put(bio);
+			return -EIO;
 		}
 		bio_put(bio);
 		ee_len    -= done;
 		ee_pblock += done  << (blkbits - 9);
 	}
-	return ret;
+	return 0;
 }
 
 #define EXT4_EXT_ZERO_LEN 7
 /*
- * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
  * extent into multiple extents (upto three - one initialized and two
  * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  *   c> Splits in three extents: Somone is writing in middle of the extent
  */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
-						struct inode *inode,
-						struct ext4_ext_path *path,
-						ext4_lblk_t iblock,
-						unsigned int max_blocks)
+					   struct inode *inode,
+					   struct ext4_map_blocks *map,
+					   struct ext4_ext_path *path)
 {
 	struct ext4_extent *ex, newex, orig_ex;
 	struct ext4_extent *ex1 = NULL;
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
 	struct ext4_extent_header *eh;
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
 	int err = 0;
 	int ret = 0;
+	int may_zeroout;
+
+	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)map->m_lblk, map->m_len);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (iblock - ee_block);
-	newblock = iblock - ee_block + ext_pblock(ex);
+	allocated = ee_len - (map->m_lblk - ee_block);
+	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
 	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
 
+	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully inside i_size or new_size.
+	 */
+	may_zeroout = ee_block + ee_len <= eof_block;
+
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
 		goto out;
 	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		return allocated;
 	}
 
-	/* ex1: ee_block to iblock - 1 : uninitialized */
-	if (iblock > ee_block) {
+	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+	if (map->m_lblk > ee_block) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
 	 * overlap of blocks.
 	 */
-	if (!ex1 && allocated > max_blocks)
-		ex2->ee_len = cpu_to_le16(max_blocks);
+	if (!ex1 && allocated > map->m_len)
+		ex2->ee_len = cpu_to_le16(map->m_len);
 	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unsigned int newdepth;
 		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
-		if (allocated <= EXT4_EXT_ZERO_LEN) {
+		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
 			/*
-			 * iblock == ee_block is handled by the zerouout
+			 * map->m_lblk == ee_block is handled by the zeroout
 			 * at the beginning.
 			 * Mark first half uninitialized.
 			 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_dirty(handle, inode, path + depth);
 
 			ex3 = &newex;
-			ex3->ee_block = cpu_to_le32(iblock);
+			ex3->ee_block = cpu_to_le32(map->m_lblk);
 			ext4_ext_store_pblock(ex3, newblock);
 			ex3->ee_len = cpu_to_le16(allocated);
 			err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				ex->ee_len   = orig_ex.ee_len;
 				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 				ext4_ext_dirty(handle, inode, path + depth);
-				/* blocks available from iblock */
+				/* blocks available from map->m_lblk */
 				return allocated;
 
 			} else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				 */
 				depth = ext_depth(inode);
 				ext4_ext_drop_refs(path);
-				path = ext4_ext_find_extent(inode,
-								iblock, path);
+				path = ext4_ext_find_extent(inode, map->m_lblk,
+							    path);
 				if (IS_ERR(path)) {
 					err = PTR_ERR(path);
 					return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			return allocated;
 		}
 		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
-		ext4_ext_store_pblock(ex3, newblock + max_blocks);
-		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+		ext4_ext_store_pblock(ex3, newblock + map->m_len);
+		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
-		if (err == -ENOSPC) {
+		if (err == -ENOSPC && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 
 		} else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 * update the extent length after successful insert of the
 		 * split extent
 		 */
-		orig_ex.ee_len = cpu_to_le16(ee_len -
-						ext4_ext_get_actual_len(ex3));
+		ee_len -= ext4_ext_get_actual_len(ex3);
+		orig_ex.ee_len = cpu_to_le16(ee_len);
+		may_zeroout = ee_block + ee_len <= eof_block;
+
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, iblock, path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		if (err)
 			goto out;
 
-		allocated = max_blocks;
+		allocated = map->m_len;
 
 		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
 		 * to insert a extent in the middle zerout directly
 		 * otherwise give the extent a chance to merge to left
 		 */
 		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
-							iblock != ee_block) {
+			map->m_lblk != ee_block && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zero out the first half */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 		}
 	}
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	if (ex1 && ex1 != ex) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
-	/* ex2: iblock to iblock + maxblocks-1 : initialised */
-	ex2->ee_block = cpu_to_le32(iblock);
+	/* ex2: map->m_lblk to map->m_lblk + map->m_len-1 : initialised */
+	ex2->ee_block = cpu_to_le32(map->m_lblk);
 	ext4_ext_store_pblock(ex2, newblock);
 	ex2->ee_len = cpu_to_le16(allocated);
 	if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
-	if (err == -ENOSPC) {
+	if (err == -ENOSPC && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
 }
 
 /*
- * This function is called by ext4_ext_get_blocks() from
+ * This function is called by ext4_ext_map_blocks() from
  * ext4_get_blocks_dio_write() when DIO to write
  * to an uninitialized extent.
  *
@@ -2927,9 +2946,8 @@ fix_extent_len:
  */
 static int ext4_split_unwritten_extents(handle_t *handle,
 					struct inode *inode,
+					struct ext4_map_blocks *map,
 					struct ext4_ext_path *path,
-					ext4_lblk_t iblock,
-					unsigned int max_blocks,
 					int flags)
 {
 	struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
 	struct ext4_extent_header *eh;
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
 	int err = 0;
+	int may_zeroout;
+
+	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)map->m_lblk, map->m_len);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
 
-	ext_debug("ext4_split_unwritten_extents: inode %lu,"
-		  "iblock %llu, max_blocks %u\n", inode->i_ino,
-		  (unsigned long long)iblock, max_blocks);
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (iblock - ee_block);
-	newblock = iblock - ee_block + ext_pblock(ex);
+	allocated = ee_len - (map->m_lblk - ee_block);
+	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
 	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
 
 	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully inside i_size or new_size.
+	 */
+	may_zeroout = ee_block + ee_len <= eof_block;
+
+	/*
  	 * If the uninitialized extent begins at the same logical
  	 * block where the write begins, and the write completely
  	 * covers the extent, then we don't need to split it.
  	 */
-	if ((iblock == ee_block) && (allocated <= max_blocks))
+	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
 		return allocated;
 
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
 		goto out;
-	/* ex1: ee_block to iblock - 1 : uninitialized */
-	if (iblock > ee_block) {
+	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+	if (map->m_lblk > ee_block) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
 	 * overlap of blocks.
 	 */
-	if (!ex1 && allocated > max_blocks)
-		ex2->ee_len = cpu_to_le16(max_blocks);
+	if (!ex1 && allocated > map->m_len)
+		ex2->ee_len = cpu_to_le16(map->m_len);
 	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unsigned int newdepth;
 		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
-		ext4_ext_store_pblock(ex3, newblock + max_blocks);
-		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+		ext4_ext_store_pblock(ex3, newblock + map->m_len);
+		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
-		if (err == -ENOSPC) {
+		if (err == -ENOSPC && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 
 		} else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		 * update the extent length after successful insert of the
 		 * split extent
 		 */
-		orig_ex.ee_len = cpu_to_le16(ee_len -
-						ext4_ext_get_actual_len(ex3));
+		ee_len -= ext4_ext_get_actual_len(ex3);
+		orig_ex.ee_len = cpu_to_le16(ee_len);
+		may_zeroout = ee_block + ee_len <= eof_block;
+
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, iblock, path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		if (err)
 			goto out;
 
-		allocated = max_blocks;
+		allocated = map->m_len;
 	}
 	/*
 	 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	 */
 	if (ex1 && ex1 != ex) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
 	/*
-	 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
-	 * uninitialised still.
+	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
+	 * using direct I/O, uninitialised still.
 	 */
-	ex2->ee_block = cpu_to_le32(iblock);
+	ex2->ee_block = cpu_to_le32(map->m_lblk);
 	ext4_ext_store_pblock(ex2, newblock);
 	ex2->ee_len = cpu_to_le16(allocated);
 	ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
-	if (err == -ENOSPC) {
+	if (err == -ENOSPC && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
 
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock, unsigned int max_blocks,
+			struct ext4_map_blocks *map,
 			struct ext4_ext_path *path, int flags,
-			unsigned int allocated, struct buffer_head *bh_result,
-			ext4_fsblk_t newblock)
+			unsigned int allocated, ext4_fsblk_t newblock)
 {
 	int ret = 0;
 	int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
 		  "block %llu, max_blocks %u, flags %d, allocated %u",
-		  inode->i_ino, (unsigned long long)iblock, max_blocks,
+		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
 	/* get_block() before submit the IO, split the extent */
 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-		ret = ext4_split_unwritten_extents(handle,
-						inode, path, iblock,
-						max_blocks, flags);
+		ret = ext4_split_unwritten_extents(handle, inode, map,
+						   path, flags);
 		/*
 		 * Flag the inode(non aio case) or end_io struct (aio case)
 		 * that this IO needs to convertion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
-			set_buffer_uninit(bh_result);
+			map->m_flags |= EXT4_MAP_UNINIT;
 		goto out;
 	}
 	/* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * the buffer head will be unmapped so that
 		 * a read from the block returns 0s.
 		 */
-		set_buffer_unwritten(bh_result);
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		goto out1;
 	}
 
 	/* buffered write, writepage time, convert*/
-	ret = ext4_ext_convert_to_initialized(handle, inode,
-						path, iblock,
-						max_blocks);
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -3226,7 +3256,7 @@ out:
 		goto out2;
 	} else
 		allocated = ret;
-	set_buffer_new(bh_result);
+	map->m_flags |= EXT4_MAP_NEW;
 	/*
 	 * if we allocated more blocks than requested
 	 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
 	 * unmapped later when we find the buffer_head marked
 	 * new.
 	 */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
-					newblock + max_blocks,
-					allocated - max_blocks);
-		allocated = max_blocks;
+					newblock + map->m_len,
+					allocated - map->m_len);
+		allocated = map->m_len;
 	}
 
 	/*
@@ -3252,13 +3282,13 @@ out:
 		ext4_da_update_reserve_space(inode, allocated, 0);
 
 map_out:
-	set_buffer_mapped(bh_result);
+	map->m_flags |= EXT4_MAP_MAPPED;
 out1:
-	if (allocated > max_blocks)
-		allocated = max_blocks;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
 	ext4_ext_show_leaf(inode, path);
-	bh_result->b_bdev = inode->i_sb->s_bdev;
-	bh_result->b_blocknr = newblock;
+	map->m_pblk = newblock;
+	map->m_len = allocated;
 out2:
 	if (path) {
 		ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
  *
  * return < 0, error case.
  */
-int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock,
-			unsigned int max_blocks, struct buffer_head *bh_result,
-			int flags)
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex, *last_ex;
 	ext4_fsblk_t newblock;
-	int err = 0, depth, ret, cache_type;
+	int i, err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 
-	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%u requested for inode %lu\n",
-			iblock, max_blocks, inode->i_ino);
+		  map->m_lblk, map->m_len, inode->i_ino);
 
 	/* check in cache */
-	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
 	if (cache_type) {
 		if (cache_type == EXT4_EXT_CACHE_GAP) {
 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			/* we should allocate requested block */
 		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
 			/* block is already allocated */
-			newblock = iblock
+			newblock = map->m_lblk
 				   - le32_to_cpu(newex.ee_block)
 				   + ext_pblock(&newex);
 			/* number of remaining blocks in the extent */
 			allocated = ext4_ext_get_actual_len(&newex) -
-					(iblock - le32_to_cpu(newex.ee_block));
+				(map->m_lblk - le32_to_cpu(newex.ee_block));
 			goto out;
 		} else {
 			BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	/* find extent for this block */
-	path = ext4_ext_find_extent(inode, iblock, NULL);
+	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
 	if (IS_ERR(path)) {
 		err = PTR_ERR(path);
 		path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 */
 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
 		EXT4_ERROR_INODE(inode, "bad extent address "
-				 "iblock: %d, depth: %d pblock %lld",
-				 iblock, depth, path[depth].p_block);
+				 "lblock: %lu, depth: %d pblock %lld",
+				 (unsigned long) map->m_lblk, depth,
+				 path[depth].p_block);
 		err = -EIO;
 		goto out2;
 	}
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 */
 		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
-		if (in_range(iblock, ee_block, ee_len)) {
-			newblock = iblock - ee_block + ee_start;
+		if (in_range(map->m_lblk, ee_block, ee_len)) {
+			newblock = map->m_lblk - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
-			allocated = ee_len - (iblock - ee_block);
-			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
-					ee_block, ee_len, newblock);
+			allocated = ee_len - (map->m_lblk - ee_block);
+			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+				  ee_block, ee_len, newblock);
 
 			/* Do not put uninitialized extent in the cache */
 			if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				goto out;
 			}
 			ret = ext4_ext_handle_uninitialized_extents(handle,
-					inode, iblock, max_blocks, path,
-					flags, allocated, bh_result, newblock);
+					inode, map, path, flags, allocated,
+					newblock);
 			return ret;
 		}
 	}
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 * put just found gap into cache to speed up
 		 * subsequent requests
 		 */
-		ext4_ext_put_gap_in_cache(inode, path, iblock);
+		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
 		goto out2;
 	}
 	/*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 */
 
 	/* find neighbour allocated blocks */
-	ar.lleft = iblock;
+	ar.lleft = map->m_lblk;
 	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
 	if (err)
 		goto out2;
-	ar.lright = iblock;
+	ar.lright = map->m_lblk;
 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
 	if (err)
 		goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
 	 * EXT_UNINIT_MAX_LEN.
 	 */
-	if (max_blocks > EXT_INIT_MAX_LEN &&
+	if (map->m_len > EXT_INIT_MAX_LEN &&
 	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-		max_blocks = EXT_INIT_MAX_LEN;
-	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
+		map->m_len = EXT_INIT_MAX_LEN;
+	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
 		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-		max_blocks = EXT_UNINIT_MAX_LEN;
+		map->m_len = EXT_UNINIT_MAX_LEN;
 
-	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
-	newex.ee_block = cpu_to_le32(iblock);
-	newex.ee_len = cpu_to_le16(max_blocks);
+	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+	newex.ee_block = cpu_to_le32(map->m_lblk);
+	newex.ee_len = cpu_to_le16(map->m_len);
 	err = ext4_ext_check_overlap(inode, &newex, path);
 	if (err)
 		allocated = ext4_ext_get_actual_len(&newex);
 	else
-		allocated = max_blocks;
+		allocated = map->m_len;
 
 	/* allocate new block */
 	ar.inode = inode;
-	ar.goal = ext4_ext_find_goal(inode, path, iblock);
-	ar.logical = iblock;
+	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+	ar.logical = map->m_lblk;
 	ar.len = allocated;
 	if (S_ISREG(inode->i_mode))
 		ar.flags = EXT4_MB_HINT_DATA;
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
 		if (ext4_should_dioread_nolock(inode))
-			set_buffer_uninit(bh_result);
+			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
-	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
 		if (unlikely(!eh->eh_entries)) {
 			EXT4_ERROR_INODE(inode,
-					 "eh->eh_entries == 0 ee_block %d",
-					 ex->ee_block);
+					 "eh->eh_entries == 0 and "
+					 "EOFBLOCKS_FL set");
 			err = -EIO;
 			goto out2;
 		}
 		last_ex = EXT_LAST_EXTENT(eh);
-		if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
-		    + ext4_ext_get_actual_len(last_ex))
-			EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+		/*
+		 * If the current leaf block was reached by looking at
+		 * the last index block all the way down the tree, and
+		 * we are extending the inode beyond the last extent
+		 * in the current leaf block, then clear the
+		 * EOFBLOCKS_FL flag.
+		 */
+		for (i = depth-1; i >= 0; i--) {
+			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+				break;
+		}
+		if ((i < 0) &&
+		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
+		     ext4_ext_get_actual_len(last_ex)))
+			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
-	if (allocated > max_blocks)
-		allocated = max_blocks;
-	set_buffer_new(bh_result);
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	map->m_flags |= EXT4_MAP_NEW;
 
 	/*
 	 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * when it is _not_ an uninitialized extent.
 	 */
 	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 	} else
 		ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
-	if (allocated > max_blocks)
-		allocated = max_blocks;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
 	ext4_ext_show_leaf(inode, path);
-	set_buffer_mapped(bh_result);
-	bh_result->b_bdev = inode->i_sb->s_bdev;
-	bh_result->b_blocknr = newblock;
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = newblock;
+	map->m_len = allocated;
 out2:
 	if (path) {
 		ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
 		 * can proceed even if the new size is the same as i_size.
 		 */
 		if (new_size > i_size_read(inode))
-			EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 
 }
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 {
 	handle_t *handle;
-	ext4_lblk_t block;
 	loff_t new_size;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
 	int retries = 0;
-	struct buffer_head map_bh;
+	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
 	/*
 	 * currently supporting (pre)allocate mode for extent-based
 	 * files _only_
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
 	/* preallocation to directories is currently not supported */
 	if (S_ISDIR(inode->i_mode))
 		return -ENODEV;
 
-	block = offset >> blkbits;
+	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
 	 * If blocksize = 4096 offset = 3072 and len = 2048
 	 */
 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-							- block;
+		- map.m_lblk;
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 	mutex_lock(&inode->i_mutex);
+	ret = inode_newsize_ok(inode, (len + offset));
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
 retry:
 	while (ret >= 0 && ret < max_blocks) {
-		block = block + ret;
-		max_blocks = max_blocks - ret;
+		map.m_lblk = map.m_lblk + ret;
+		map.m_len = max_blocks = max_blocks - ret;
 		handle = ext4_journal_start(inode, credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
 		}
-		map_bh.b_state = 0;
-		ret = ext4_get_blocks(handle, inode, block,
-				      max_blocks, &map_bh,
+		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
-				    inode->i_ino, block, max_blocks);
+				    inode->i_ino, map.m_lblk, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
 			ret2 = ext4_journal_stop(handle);
 			break;
 		}
-		if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
+		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
 						blkbits) >> blkbits))
 			new_size = offset + len;
 		else
-			new_size = (block + ret) << blkbits;
+			new_size = (map.m_lblk + ret) << blkbits;
 
 		ext4_falloc_update_inode(inode, mode, new_size,
-						buffer_new(&map_bh));
+					 (map.m_flags & EXT4_MAP_NEW));
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
 		if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				    ssize_t len)
 {
 	handle_t *handle;
-	ext4_lblk_t block;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
-	struct buffer_head map_bh;
+	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
-	block = offset >> blkbits;
+	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
 	 * If blocksize = 4096 offset = 3072 and len = 2048
 	 */
-	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-							- block;
+	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+		      map.m_lblk);
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 	while (ret >= 0 && ret < max_blocks) {
-		block = block + ret;
-		max_blocks = max_blocks - ret;
+		map.m_lblk += ret;
+		map.m_len = (max_blocks -= ret);
 		handle = ext4_journal_start(inode, credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
 		}
-		map_bh.b_state = 0;
-		ret = ext4_get_blocks(handle, inode, block,
-				      max_blocks, &map_bh,
+		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
-				    inode->i_ino, block, max_blocks);
+				    inode->i_ino, map.m_lblk, map.m_len);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	int error = 0;
 
 	/* fallback to generic here if not in extents fmt */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return generic_block_fiemap(inode, fieinfo, start, len,
 			ext4_get_block);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f3..5313ae4cda2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 	 * is smaller than s_maxbytes, which is for extent-mapped files.
 	 */
 
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 		size_t length = iov_length(iov, nr_segs);
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67c..b6a74f991bf 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
 #include <trace/events/ext4.h>
 
 /*
+ * Without a journal, fsync of a newly created file must also push out
+ * the directory entry that names it; otherwise the parent directory is
+ * written only by background writeback and a crash in that window can
+ * lose the file.  Walk upwards while each inode still carries the
+ * EXT4_STATE_NEWENTRY state, clearing it and syncing the buffers of
+ * the parent's mapping, so freshly created ancestors are covered too.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+	struct dentry *dent, *up;
+
+	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+		dent = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		up = dent ? dent->d_parent : NULL;
+		if (!up || !up->d_inode)
+			return;	/* no parent inode left to sync */
+		inode = up->d_inode;
+		sync_mapping_buffers(inode->i_mapping);
+	}
+}
+
+/*
  * akpm: A new design for ext4_sync_file().
  *
  * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	ret = flush_completed_IO(inode);
 	if (ret < 0)
 		return ret;
-	
-	if (!journal)
-		return simple_fsync(file, dentry, datasync);
+
+	if (!journal) {
+		ret = simple_fsync(file, dentry, datasync);
+		if (!ret && !list_empty(&inode->i_dentry))
+			ext4_sync_parent(inode);
+		return ret;
+	}
 
 	/*
 	 * data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		    (journal->j_flags & JBD2_BARRIER))
 			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
 					NULL, BLKDEV_IFL_WAIT);
-		jbd2_log_wait_commit(journal, commit_tid);
+		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
 			BLKDEV_IFL_WAIT);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f0..25c4b3173fd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	if (fatal)
 		goto error_return;
 
-	/* Ok, now we can actually update the inode bitmaps.. */
-	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-					bit, bitmap_bh->b_data);
-	if (!cleared)
-		ext4_error(sb, "bit already cleared for inode %lu", ino);
-	else {
-		gdp = ext4_get_group_desc(sb, block_group, &bh2);
-
+	fatal = -ESRCH;
+	gdp = ext4_get_group_desc(sb, block_group, &bh2);
+	if (gdp) {
 		BUFFER_TRACE(bh2, "get_write_access");
 		fatal = ext4_journal_get_write_access(handle, bh2);
-		if (fatal) goto error_return;
-
-		if (gdp) {
-			ext4_lock_group(sb, block_group);
-			count = ext4_free_inodes_count(sb, gdp) + 1;
-			ext4_free_inodes_set(sb, gdp, count);
-			if (is_directory) {
-				count = ext4_used_dirs_count(sb, gdp) - 1;
-				ext4_used_dirs_set(sb, gdp, count);
-				if (sbi->s_log_groups_per_flex) {
-					ext4_group_t f;
-
-					f = ext4_flex_group(sbi, block_group);
-					atomic_dec(&sbi->s_flex_groups[f].used_dirs);
-				}
+	}
+	ext4_lock_group(sb, block_group);
+	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	if (fatal || !cleared) {
+		ext4_unlock_group(sb, block_group);
+		goto out;
+	}
 
-			}
-			gdp->bg_checksum = ext4_group_desc_csum(sbi,
-							block_group, gdp);
-			ext4_unlock_group(sb, block_group);
-			percpu_counter_inc(&sbi->s_freeinodes_counter);
-			if (is_directory)
-				percpu_counter_dec(&sbi->s_dirs_counter);
-
-			if (sbi->s_log_groups_per_flex) {
-				ext4_group_t f;
-
-				f = ext4_flex_group(sbi, block_group);
-				atomic_inc(&sbi->s_flex_groups[f].free_inodes);
-			}
-		}
-		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
-		err = ext4_handle_dirty_metadata(handle, NULL, bh2);
-		if (!fatal) fatal = err;
+	count = ext4_free_inodes_count(sb, gdp) + 1;
+	ext4_free_inodes_set(sb, gdp, count);
+	if (is_directory) {
+		count = ext4_used_dirs_count(sb, gdp) - 1;
+		ext4_used_dirs_set(sb, gdp, count);
+		percpu_counter_dec(&sbi->s_dirs_counter);
 	}
-	BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-	if (!fatal)
-		fatal = err;
-	sb->s_dirt = 1;
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+	ext4_unlock_group(sb, block_group);
+
+	percpu_counter_inc(&sbi->s_freeinodes_counter);
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		if (is_directory)
+			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+	}
+	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+	if (cleared) {
+		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+		if (!fatal)
+			fatal = err;
+		sb->s_dirt = 1;
+	} else
+		ext4_error(sb, "bit already cleared for inode %lu", ino);
+
 error_return:
 	brelse(bitmap_bh);
 	ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 
 	if (S_ISDIR(mode) &&
 	    ((parent == sb->s_root->d_inode) ||
-	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
 		int best_ndir = inodes_per_group;
 		int ret = -1;
 
@@ -1041,7 +1034,7 @@ got:
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 		/* set extent flag only for directory, file and normal symlink*/
 		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
-			EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
 			ext4_ext_tree_init(handle, inode);
 		}
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08..19df61c321f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 	int ret;
 
 	/*
-	 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 	 * moment, get_block can be called only for blocks inside i_size since
 	 * page cache has been already dropped and writes are blocked by
 	 * i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 		if (blk &&
 		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						    blk, 1))) {
-			__ext4_error(inode->i_sb, function,
-				   "invalid block reference %u "
-				   "in inode #%lu", blk, inode->i_ino);
+			ext4_error_inode(function, inode,
+					 "invalid block reference %u", blk);
 			return -EIO;
 		}
 	}
@@ -785,7 +784,7 @@ failed:
 	/* Allocation failed, free what we already allocated */
 	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
-		/* 
+		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 
 err_out:
 	for (i = 1; i <= num; i++) {
-		/* 
+		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
 }
 
 /*
- * The ext4_ind_get_blocks() function handles non-extents inodes
+ * The ext4_ind_map_blocks() function handles non-extents inodes
  * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_get_blocks().
+ * scheme) for ext4_map_blocks().
  *
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
  * blocks.
  */
-static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t iblock, unsigned int maxblocks,
-			       struct buffer_head *bh_result,
+static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map,
 			       int flags)
 {
 	int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 
-	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
-	depth = ext4_block_to_path(inode, iblock, offsets,
+	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
 				   &blocks_to_boundary);
 
 	if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result);
 		count++;
 		/*map more blocks*/
-		while (count < maxblocks && count <= blocks_to_boundary) {
+		while (count < map->m_len && count <= blocks_to_boundary) {
 			ext4_fsblk_t blk;
 
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Okay, we need to do block allocation.
 	*/
-	goal = ext4_find_goal(inode, iblock, partial);
+	goal = ext4_find_goal(inode, map->m_lblk, partial);
 
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
 	indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * direct blocks to allocate for this branch.
 	 */
 	count = ext4_blks_to_allocate(partial, indirect_blks,
-					maxblocks, blocks_to_boundary);
+				      map->m_len, blocks_to_boundary);
 	/*
 	 * Block out ext4_truncate while we alter the tree
 	 */
-	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
 				&count, goal,
 				offsets + (partial - chain), partial);
 
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * may need to return -EAGAIN upwards in the worst case.  --sct
 	 */
 	if (!err)
-		err = ext4_splice_branch(handle, inode, iblock,
+		err = ext4_splice_branch(handle, inode, map->m_lblk,
 					 partial, indirect_blks, count);
 	if (err)
 		goto cleanup;
 
-	set_buffer_new(bh_result);
+	map->m_flags |= EXT4_MAP_NEW;
 
 	ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = le32_to_cpu(chain[depth-1].key);
+	map->m_len = count;
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		map->m_flags |= EXT4_MAP_BOUNDARY;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
 		brelse(partial->bh);
 		partial--;
 	}
-	BUFFER_TRACE(bh_result, "returned");
 out:
 	return err;
 }
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
  */
 static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_calc_metadata_amount(inode, lblock);
 
 	return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int mdb_free = 0, allocated_meta_blocks = 0;
 
 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
 
 	/* Update per-inode reservations */
 	ei->i_reserved_data_blocks -= used;
-	used += ei->i_allocated_meta_blocks;
 	ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
-	allocated_meta_blocks = ei->i_allocated_meta_blocks;
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+			   used + ei->i_allocated_meta_blocks);
 	ei->i_allocated_meta_blocks = 0;
-	percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
 
 	if (ei->i_reserved_data_blocks == 0) {
 		/*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
-		mdb_free = ei->i_reserved_meta_blocks;
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
-		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
 	}
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-	/* Update quota subsystem */
-	if (quota_claim) {
+	/* Update quota subsystem for data blocks */
+	if (quota_claim)
 		dquot_claim_block(inode, used);
-		if (mdb_free)
-			dquot_release_reservation_block(inode, mdb_free);
-	} else {
+	else {
 		/*
 		 * We did fallocate with an offset that is already delayed
 		 * allocated. So on delayed allocated writeback we should
-		 * not update the quota for allocated blocks. But then
-		 * converting an fallocate region to initialized region would
-		 * have caused a metadata allocation. So claim quota for
-		 * that
+		 * not re-claim the quota for fallocated blocks.
 		 */
-		if (allocated_meta_blocks)
-			dquot_claim_block(inode, allocated_meta_blocks);
-		dquot_release_reservation_block(inode, mdb_free + used);
+		dquot_release_reservation_block(inode, used);
 	}
 
 	/*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		ext4_discard_preallocations(inode);
 }
 
-static int check_block_validity(struct inode *inode, const char *msg,
-				sector_t logical, sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *func,
+				struct ext4_map_blocks *map)
 {
-	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-		__ext4_error(inode->i_sb, msg,
-			   "inode #%lu logical block %llu mapped to %llu "
-			   "(size %d)", inode->i_ino,
-			   (unsigned long long) logical,
-			   (unsigned long long) phys, len);
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+				   map->m_len)) {
+		ext4_error_inode(func, inode,
+			   "lblock %lu mapped to illegal pblock %llu "
+			   "(length %d)", (unsigned long) map->m_lblk,
+				 map->m_pblk, map->m_len);
 		return -EIO;
 	}
 	return 0;
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 }
 
 /*
- * The ext4_get_blocks() function tries to look up the requested blocks,
+ * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
  * and store the allocated blocks in the result buffer head and mark it
  * mapped.
  *
- * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
+ * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
  * based files
  *
  * On success, it returns the number of blocks being mapped or allocate.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
  *
  * It returns the error in case of allocation failure.
  */
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
-		    unsigned int max_blocks, struct buffer_head *bh,
-		    int flags)
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+		    struct ext4_map_blocks *map, int flags)
 {
 	int retval;
 
-	clear_buffer_mapped(bh);
-	clear_buffer_unwritten(bh);
-
-	ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
-		  "logical block %lu\n", inode->i_ino, flags, max_blocks,
-		  (unsigned long)block);
+	map->m_flags = 0;
+	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
+		  (unsigned long) map->m_lblk);
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-				bh, 0);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, 0);
 	} else {
-		retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
-					     bh, 0);
+		retval = ext4_ind_map_blocks(handle, inode, map, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
-	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, "file system corruption",
-					       block, bh->b_blocknr, retval);
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+		int ret = check_block_validity(inode, __func__, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * ext4_ext_get_block() returns th create = 0
 	 * with buffer head unmapped.
 	 */
-	if (retval > 0 && buffer_mapped(bh))
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
 		return retval;
 
 	/*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * of BH_Unwritten and BH_Mapped flags being simultaneously
 	 * set on the buffer_head.
 	 */
-	clear_buffer_unwritten(bh);
+	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * We need to check for EXT4 here because migrate
 	 * could have changed the inode type in between
 	 */
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-					      bh, flags);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, flags);
 	} else {
-		retval = ext4_ind_get_blocks(handle, inode, block,
-					     max_blocks, bh, flags);
+		retval = ext4_ind_map_blocks(handle, inode, map, flags);
 
-		if (retval > 0 && buffer_new(bh)) {
+		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
 			/*
 			 * We allocated new blocks which will result in
 			 * i_data's format changing.  Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
 
 	up_write((&EXT4_I(inode)->i_data_sem));
-	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, "file system "
-					       "corruption after allocation",
-					       block, bh->b_blocknr, retval);
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+		int ret = check_block_validity(inode,
+					       "ext4_map_blocks_after_alloc",
+					       map);
 		if (ret != 0)
 			return ret;
 	}
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-int ext4_get_block(struct inode *inode, sector_t iblock,
-		   struct buffer_head *bh_result, int create)
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int flags)
 {
 	handle_t *handle = ext4_journal_current_handle();
+	struct ext4_map_blocks map;
 	int ret = 0, started = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int dio_credits;
 
-	if (create && !handle) {
+	map.m_lblk = iblock;
+	map.m_len = bh->b_size >> inode->i_blkbits;
+
+	if (flags && !handle) {
 		/* Direct IO write... */
-		if (max_blocks > DIO_MAX_BLOCKS)
-			max_blocks = DIO_MAX_BLOCKS;
-		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+		if (map.m_len > DIO_MAX_BLOCKS)
+			map.m_len = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 		handle = ext4_journal_start(inode, dio_credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-			goto out;
+			return ret;
 		}
 		started = 1;
 	}
 
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+	ret = ext4_map_blocks(handle, inode, &map, flags);
 	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
+		map_bh(bh, inode->i_sb, map.m_pblk);
+		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 		ret = 0;
 	}
 	if (started)
 		ext4_journal_stop(handle);
-out:
 	return ret;
 }
 
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh, int create)
+{
+	return _ext4_get_block(inode, iblock, bh,
+			       create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 				ext4_lblk_t block, int create, int *errp)
 {
-	struct buffer_head dummy;
+	struct ext4_map_blocks map;
+	struct buffer_head *bh;
 	int fatal = 0, err;
-	int flags = 0;
 
 	J_ASSERT(handle != NULL || create == 0);
 
-	dummy.b_state = 0;
-	dummy.b_blocknr = -1000;
-	buffer_trace_init(&dummy.b_history);
-	if (create)
-		flags |= EXT4_GET_BLOCKS_CREATE;
-	err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
-	/*
-	 * ext4_get_blocks() returns number of blocks mapped. 0 in
-	 * case of a HOLE.
-	 */
-	if (err > 0) {
-		if (err > 1)
-			WARN_ON(1);
-		err = 0;
+	map.m_lblk = block;
+	map.m_len = 1;
+	err = ext4_map_blocks(handle, inode, &map,
+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+	*errp = 0;	/* set on every path, including a hole (err == 0) */
+	if (err < 0)
+		*errp = err;
+	if (err <= 0)
+		return NULL;
+
+	bh = sb_getblk(inode->i_sb, map.m_pblk);
+	if (!bh) {
+		*errp = -EIO;
+		return NULL;
 	}
-	*errp = err;
-	if (!err && buffer_mapped(&dummy)) {
-		struct buffer_head *bh;
-		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-		if (!bh) {
-			*errp = -EIO;
-			goto err;
-		}
-		if (buffer_new(&dummy)) {
-			J_ASSERT(create != 0);
-			J_ASSERT(handle != NULL);
+	if (map.m_flags & EXT4_MAP_NEW) {
+		J_ASSERT(create != 0);
+		J_ASSERT(handle != NULL);
 
-			/*
-			 * Now that we do not always journal data, we should
-			 * keep in mind whether this should always journal the
-			 * new buffer as metadata.  For now, regular file
-			 * writes use ext4_get_block instead, so it's not a
-			 * problem.
-			 */
-			lock_buffer(bh);
-			BUFFER_TRACE(bh, "call get_create_access");
-			fatal = ext4_journal_get_create_access(handle, bh);
-			if (!fatal && !buffer_uptodate(bh)) {
-				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
-				set_buffer_uptodate(bh);
-			}
-			unlock_buffer(bh);
-			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-			err = ext4_handle_dirty_metadata(handle, inode, bh);
-			if (!fatal)
-				fatal = err;
-		} else {
-			BUFFER_TRACE(bh, "not a new buffer");
-		}
-		if (fatal) {
-			*errp = fatal;
-			brelse(bh);
-			bh = NULL;
+		/*
+		 * Now that we do not always journal data, we should
+		 * keep in mind whether this should always journal the
+		 * new buffer as metadata.  For now, regular file
+		 * writes use ext4_get_block instead, so it's not a
+		 * problem.
+		 */
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		fatal = ext4_journal_get_create_access(handle, bh);
+		if (!fatal && !buffer_uptodate(bh)) {
+			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+			set_buffer_uptodate(bh);
 		}
-		return bh;
+		unlock_buffer(bh);
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (!fatal)
+			fatal = err;
+	} else {
+		BUFFER_TRACE(bh, "not a new buffer");
 	}
-err:
-	return NULL;
+	if (fatal) {
+		*errp = fatal;
+		brelse(bh);
+		bh = NULL;
+	}
+	return bh;
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long md_needed, md_reserved;
+	unsigned long md_needed;
 	int ret;
 
 	/*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 	 */
 repeat:
 	spin_lock(&ei->i_block_reservation_lock);
-	md_reserved = ei->i_reserved_meta_blocks;
 	md_needed = ext4_calc_metadata_amount(inode, lblock);
 	trace_ext4_da_reserve_space(inode, md_needed);
 	spin_unlock(&ei->i_block_reservation_lock);
 
 	/*
-	 * Make quota reservation here to prevent quota overflow
-	 * later. Real quota accounting is done at pages writeout
-	 * time.
+	 * We will charge metadata quota at writeout time; this saves
+	 * us from metadata over-estimation, though we may go over by
+	 * a small amount in the end.  Here we just reserve for data.
 	 */
-	ret = dquot_reserve_block(inode, md_needed + 1);
+	ret = dquot_reserve_block(inode, 1);
 	if (ret)
 		return ret;
-
+	/*
+	 * We do still charge estimated metadata to the sb though;
+	 * we cannot afford to run out of free blocks.
+	 */
 	if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
-		dquot_release_reservation_block(inode, md_needed + 1);
+		dquot_release_reservation_block(inode, 1);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 
+	trace_ext4_da_release_space(inode, to_free);
 	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
 		/*
 		 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
-		to_free += ei->i_reserved_meta_blocks;
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
 	}
 
-	/* update fs dirty blocks counter */
+	/* update fs dirty data blocks counter */
 	percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
 
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 /*
  * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
  *
- * @mpd->inode - inode to walk through
- * @exbh->b_blocknr - first block on a disk
- * @exbh->b_size - amount of space in bytes
- * @logical - first logical block to start assignment with
- *
  * the function goes through all passed space and put actual disk
  * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
  */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
-				 struct buffer_head *exbh)
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
+				 struct ext4_map_blocks *map)
 {
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	int blocks = exbh->b_size >> inode->i_blkbits;
-	sector_t pblock = exbh->b_blocknr, cur_logical;
+	int blocks = map->m_len;
+	sector_t pblock = map->m_pblk, cur_logical;
 	struct buffer_head *head, *bh;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	int nr_pages, i;
 
-	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
 	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 
 	pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 
 			/* skip blocks out of the range */
 			do {
-				if (cur_logical >= logical)
+				if (cur_logical >= map->m_lblk)
 					break;
 				cur_logical++;
 			} while ((bh = bh->b_this_page) != head);
 
 			do {
-				if (cur_logical >= logical + blocks)
+				if (cur_logical >= map->m_lblk + blocks)
 					break;
 
-				if (buffer_delay(bh) ||
-						buffer_unwritten(bh)) {
+				if (buffer_delay(bh) || buffer_unwritten(bh)) {
 
 					BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
 
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
-				if (buffer_uninit(exbh))
+				if (map->m_flags & EXT4_MAP_UNINIT)
 					set_buffer_uninit(bh);
 				cur_logical++;
 				pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 }
 
 
-/*
- * __unmap_underlying_blocks - just a helper function to unmap
- * set of blocks described by @bh
- */
-static inline void __unmap_underlying_blocks(struct inode *inode,
-					     struct buffer_head *bh)
-{
-	struct block_device *bdev = inode->i_sb->s_bdev;
-	int blocks, i;
-
-	blocks = bh->b_size >> inode->i_blkbits;
-	for (i = 0; i < blocks; i++)
-		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
-}
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 					sector_t logical, long blk_cnt)
 {
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err, blks, get_blocks_flags;
-	struct buffer_head new;
+	struct ext4_map_blocks map;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
 	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
 	 * variables are updated after the blocks have been allocated.
 	 */
-	new.b_state = 0;
+	map.m_lblk = next;
+	map.m_len = max_blocks;
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
 	if (ext4_should_dioread_nolock(mpd->inode))
 		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
 	if (mpd->b_state & (1 << BH_Delay))
 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
-	blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
-			       &new, get_blocks_flags);
+	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
 	if (blks < 0) {
 		err = blks;
 		/*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		ext4_msg(mpd->inode->i_sb, KERN_CRIT,
 			 "delayed block allocation failed for inode %lu at "
 			 "logical offset %llu with max blocks %zd with "
-			 "error %d\n", mpd->inode->i_ino,
+			 "error %d", mpd->inode->i_ino,
 			 (unsigned long long) next,
 			 mpd->b_size >> mpd->inode->i_blkbits, err);
 		printk(KERN_CRIT "This should not happen!!  "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	}
 	BUG_ON(blks == 0);
 
-	new.b_size = (blks << mpd->inode->i_blkbits);
+	if (map.m_flags & EXT4_MAP_NEW) {
+		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+		int i;
 
-	if (buffer_new(&new))
-		__unmap_underlying_blocks(mpd->inode, &new);
+		for (i = 0; i < map.m_len; i++)
+			unmap_underlying_metadata(bdev, map.m_pblk + i);
+	}
 
 	/*
 	 * If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 */
 	if ((mpd->b_state & (1 << BH_Delay)) ||
 	    (mpd->b_state & (1 << BH_Unwritten)))
-		mpage_put_bnr_to_bhs(mpd, next, &new);
+		mpage_put_bnr_to_bhs(mpd, &map);
 
 	if (ext4_should_order_data(mpd->inode)) {
 		err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	sector_t next;
 	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 
+	/*
+	 * XXX Don't go larger than mballoc is willing to allocate
+	 * This is a stopgap solution.  We eventually need to fold
+	 * mpage_da_submit_io() into this function and then call
+	 * ext4_map_blocks() multiple times in a loop
+	 */
+	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+		goto flush_it;
+
 	/* check if thereserved journal credits might overflow */
-	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
 			/*
 			 * With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
 	struct buffer_head *bh, *head;
 	sector_t logical;
 
-	if (mpd->io_done) {
-		/*
-		 * Rest of the page in the page_vec
-		 * redirty then and skip then. We will
-		 * try to write them again after
-		 * starting a new transaction
-		 */
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return MPAGE_DA_EXTENT_TAIL;
-	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
  * initialized properly.
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-				  struct buffer_head *bh_result, int create)
+				  struct buffer_head *bh, int create)
 {
+	struct ext4_map_blocks map;
 	int ret = 0;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		invalid_block = ~0;
 
 	BUG_ON(create == 0);
-	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+	map.m_lblk = iblock;
+	map.m_len = 1;
 
 	/*
 	 * first, we need to know whether the block is allocated already
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
-	if ((ret == 0) && !buffer_delay(bh_result)) {
-		/* the block isn't (pre)allocated yet, let's reserve space */
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	if (ret == 0) {
+		if (buffer_delay(bh))
+			return 0; /* Not sure this could or should happen */
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 			/* not enough space to reserve */
 			return ret;
 
-		map_bh(bh_result, inode->i_sb, invalid_block);
-		set_buffer_new(bh_result);
-		set_buffer_delay(bh_result);
-	} else if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		if (buffer_unwritten(bh_result)) {
-			/* A delayed write to unwritten bh should
-			 * be marked new and mapped.  Mapped ensures
-			 * that we don't do get_block multiple times
-			 * when we write to the same offset and new
-			 * ensures that we do proper zero out for
-			 * partial write.
-			 */
-			set_buffer_new(bh_result);
-			set_buffer_mapped(bh_result);
-		}
-		ret = 0;
+		map_bh(bh, inode->i_sb, invalid_block);
+		set_buffer_new(bh);
+		set_buffer_delay(bh);
+		return 0;
 	}
 
-	return ret;
+	map_bh(bh, inode->i_sb, map.m_pblk);
+	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+	if (buffer_unwritten(bh)) {
+		/* A delayed write to unwritten bh should be marked
+		 * new and mapped.  Mapped ensures that we don't do
+		 * get_block multiple times when we write to the same
+		 * offset and new ensures that we do proper zero out
+		 * for partial write.
+		 */
+		set_buffer_new(bh);
+		set_buffer_mapped(bh);
+	}
+	return 0;
 }
 
 /*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
-	int ret = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-
 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-
-	/*
-	 * we don't want to do block allocation in writepage
-	 * so call get_block_wrap with create = 0
-	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result, 0);
 }
 
 static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 	 * number of contiguous block. So we will limit
 	 * number of contiguous block to a sane value
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
 	    (max_blocks > EXT4_MAX_TRANS_DATA))
 		max_blocks = EXT4_MAX_TRANS_DATA;
 
 	return ext4_chunk_trans_blocks(inode, max_blocks);
 }
 
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages().  Differences:
+ *	Range cyclic is ignored.
+ *	no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+				struct writeback_control *wbc,
+				struct mpage_da_data *mpd)
+{
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	long nr_to_write = wbc->nr_to_write;
+
+	pagevec_init(&pvec, 0);
+	index = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			      PAGECACHE_TAG_DIRTY,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping. However, page->index will not change
+			 * because we have a reference on the page.
+			 */
+			if (page->index > end) {
+				done = 1;
+				break;
+			}
+
+			lock_page(page);
+
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data integrity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (PageWriteback(page)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
+				else
+					goto continue_unlock;
+			}
+
+			BUG_ON(PageWriteback(page));
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			ret = __mpage_da_writepage(page, wbc, mpd);
+			if (unlikely(ret)) {
+				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+					unlock_page(page);
+					ret = 0;
+				} else {
+					done = 1;
+					break;
+				}
+			}
+
+			if (nr_to_write > 0) {
+				nr_to_write--;
+				if (nr_to_write == 0 &&
+				    wbc->sync_mode == WB_SYNC_NONE) {
+					/*
+					 * We stop writing back only if we are
+					 * not doing integrity sync. In case of
+					 * integrity sync we have to keep going
+					 * because someone may be concurrently
+					 * dirtying pages, and we might have
+					 * synced a lot of newly appeared dirty
+					 * pages, but have not synced all of the
+					 * old dirty pages.
+					 */
+					done = 1;
+					break;
+				}
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return ret;
+}
+
+
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
-	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
 	unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 
-	/*
-	 * we don't want write_cache_pages to update
-	 * nr_to_write and writeback_index
-	 */
-	no_nrwrite_index_update = wbc->no_nrwrite_index_update;
-	wbc->no_nrwrite_index_update = 1;
 	pages_skipped = wbc->pages_skipped;
 
 retry:
@@ -2941,7 +3011,7 @@ retry:
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
-			       "%ld pages, ino %lu; err %d\n", __func__,
+			       "%ld pages, ino %lu; err %d", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
 			goto out_writepages;
 		}
@@ -2963,8 +3033,7 @@ retry:
 		mpd.io_done = 0;
 		mpd.pages_written = 0;
 		mpd.retval = 0;
-		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
-					&mpd);
+		ret = write_cache_pages_da(mapping, wbc, &mpd);
 		/*
 		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
 	if (pages_skipped != wbc->pages_skipped)
 		ext4_msg(inode->i_sb, KERN_CRIT,
 			 "This should not happen leaving %s "
-			 "with nr_to_write = %ld ret = %d\n",
+			 "with nr_to_write = %ld ret = %d",
 			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
@@ -3030,8 +3099,6 @@ retry:
 		mapping->writeback_index = index;
 
 out_writepages:
-	if (!no_nrwrite_index_update)
-		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
 	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 			       loff_t pos, unsigned len, unsigned flags,
 			       struct page **pagep, void **fsdata)
 {
-	int ret, retries = 0, quota_retries = 0;
+	int ret, retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-
-	if ((ret == -EDQUOT) &&
-	    EXT4_I(inode)->i_reserved_meta_blocks &&
-	    (quota_retries++ < 3)) {
-		/*
-		 * Since we often over-estimate the number of meta
-		 * data blocks required, we may sometimes get a
-		 * spurios out of quota error even though there would
-		 * be enough space once we write the data blocks and
-		 * find out how many meta data blocks were _really_
-		 * required.  So try forcing the inode write to see if
-		 * that helps.
-		 */
-		write_inode_now(inode, (quota_retries == 3));
-		goto retry;
-	}
 out:
 	return ret;
 }
@@ -3546,46 +3597,18 @@ out:
 	return ret;
 }
 
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uninitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	handle_t *handle = ext4_journal_current_handle();
-	int ret = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int dio_credits;
-	int started = 0;
-
 	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	/*
-	 * ext4_get_block in prepare for a DIO write or buffer write.
-	 * We allocate an uinitialized extent if blocks haven't been allocated.
-	 * The extent will be converted to initialized after IO complete.
-	 */
-	create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
-
-	if (!handle) {
-		if (max_blocks > DIO_MAX_BLOCKS)
-			max_blocks = DIO_MAX_BLOCKS;
-		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-		handle = ext4_journal_start(inode, dio_credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-		started = 1;
-	}
-
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	if (started)
-		ext4_journal_stop(handle);
-out:
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
 static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
 
 	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
-		ext4_error(inode->i_sb, "inode #%lu: "
-			   "attempt to clear blocks %llu len %lu, invalid",
-			   inode->i_ino, (unsigned long long) block_to_free,
-			   count);
+		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+				 "blocks %llu len %lu",
+				 (unsigned long long) block_to_free, count);
 		return 1;
 	}
 
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
 			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
-			ext4_error(inode->i_sb,
-				   "circular indirect block detected, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
-				   (unsigned long long) this_bh->b_blocknr);
+			EXT4_ERROR_INODE(inode,
+					 "circular indirect block detected at "
+					 "block %llu",
+				(unsigned long long) this_bh->b_blocknr);
 	}
 }
 
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 
 			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						   nr, 1)) {
-				ext4_error(inode->i_sb,
-					   "indirect mapped block in inode "
-					   "#%lu invalid (level %d, blk #%lu)",
-					   inode->i_ino, depth,
-					   (unsigned long) nr);
+				EXT4_ERROR_INODE(inode,
+						 "invalid indirect mapped "
+						 "block %lu (level %d)",
+						 (unsigned long) nr, depth);
 				break;
 			}
 
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * (should be rare).
 			 */
 			if (!bh) {
-				ext4_error(inode->i_sb,
-					   "Read failure, inode=%lu, block=%llu",
-					   inode->i_ino, nr);
+				EXT4_ERROR_INODE(inode,
+						 "Read failure block=%llu",
+						 (unsigned long long) nr);
 				continue;
 			}
 
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ext4_ext_truncate(inode);
 		return;
 	}
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 	bh = sb_getblk(sb, block);
 	if (!bh) {
-		ext4_error(sb, "unable to read inode block - "
-			   "inode=%lu, block=%llu", inode->i_ino, block);
+		EXT4_ERROR_INODE(inode, "unable to read inode block - "
+				 "block %llu", block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			ext4_error(sb, "unable to read inode block - inode=%lu,"
-				   " block=%llu", inode->i_ino, block);
+			EXT4_ERROR_INODE(inode, "unable to read inode "
+					 "block %llu", block);
 			brelse(bh);
 			return -EIO;
 		}
@@ -5096,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ret = 0;
 	if (ei->i_file_acl &&
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-		ext4_error(sb, "bad extended attribute block %llu inode #%lu",
-			   ei->i_file_acl, inode->i_ino);
+		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+				 ei->i_file_acl);
 		ret = -EIO;
 		goto bad_inode;
 	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	} else {
 		ret = -EIO;
-		ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
-			   inode->i_mode, inode->i_ino);
+		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
 		goto bad_inode;
 	}
 	brelse(iloc.bh);
@@ -5381,9 +5400,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->sync_mode == WB_SYNC_ALL)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-			ext4_error(inode->i_sb, "IO error syncing inode, "
-				   "inode=%lu, block=%llu", inode->i_ino,
-				   (unsigned long long)iloc.bh->b_blocknr);
+			EXT4_ERROR_INODE(inode,
+				"IO error syncing inode (block=%llu)",
+				(unsigned long long) iloc.bh->b_blocknr);
 			err = -EIO;
 		}
 		brelse(iloc.bh);
@@ -5455,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE &&
 	    (attr->ia_size < inode->i_size ||
-	     (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+	     (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
 		handle_t *handle;
 
 		handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			}
 		}
 		/* ext4_truncate will clear the flag */
-		if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+		if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
 			ext4_truncate(inode);
 	}
 
@@ -5576,7 +5595,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
 	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
@@ -5911,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	 */
 
 	if (val)
-		EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else
-		EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294..bf5ae883b1b 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
 		if (me.moved_len > 0)
 			file_remove_suid(donor_filp);
 
-		if (copy_to_user((struct move_extent __user *)arg, 
+		if (copy_to_user((struct move_extent __user *)arg,
 				 &me, sizeof(me)))
 			err = -EFAULT;
 mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC32_SETRSVSZ:
 		cmd = EXT4_IOC_SETRSVSZ;
 		break;
-	case EXT4_IOC_GROUP_ADD:
+	case EXT4_IOC32_GROUP_ADD: {
+		struct compat_ext4_new_group_input __user *uinput;
+		struct ext4_new_group_input input;
+		mm_segment_t old_fs;
+		int err;
+
+		uinput = compat_ptr(arg);
+		err = get_user(input.group, &uinput->group);
+		err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+		err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+		err |= get_user(input.inode_table, &uinput->inode_table);
+		err |= get_user(input.blocks_count, &uinput->blocks_count);
+		err |= get_user(input.reserved_blocks,
+				&uinput->reserved_blocks);
+		if (err)
+			return -EFAULT;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+				 (unsigned long) &input);
+		set_fs(old_fs);
+		return err;
+	}
+	case EXT4_IOC_MOVE_EXT:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca..12b3bc026a6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	}
 }
 
+/*
+ * Scan grp->bb_counters[] from order (s_blocksize_bits + 1) downwards and
+ * store the first order with a positive count in bb_largest_free_order.
+ * The field is left at -1 when every counter is empty.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+	int order = sb->s_blocksize_bits + 1;
+
+	/* Start from "nothing free"; overwritten below if a counter is set. */
+	grp->bb_largest_free_order = -1;
+
+	/* Skip the empty high orders. */
+	while (order >= 0 && grp->bb_counters[order] <= 0)
+		order--;
+
+	if (order >= 0)
+		grp->bb_largest_free_order = order;
+}
+
 static noinline_for_stack
 void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 		 */
 		grp->bb_free = free;
 	}
+	mb_set_largest_free_order(sb, grp);
 
 	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
  * So it can have information regarding groups_per_page which
  * is blocks_per_page/2
+ *
+ * Locking note:  This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
  */
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore == NULL);
 			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_buddy_bitmap_load(sb, group);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore != NULL);
 			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_bitmap_load(sb, group);
 
 			/* see comments in ext4_mb_put_pa() */
 			ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
 	return err;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1036,11 @@ err:
 	return ret;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
 	return ret;
 }
 
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_page)
 		page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			buddy = buddy2;
 		} while (1);
 	}
+	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }
 
@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
 	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
 	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 	}
 
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);
 
 	return 0;
 }
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 		ext4_mb_use_best_found(ac, e4b);
 	}
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);
 
 	return 0;
 }
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }
 
+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
 	unsigned free, fragments;
-	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
 	BUG_ON(cr < 0 || cr >= 4);
-	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}
 
 	free = grp->bb_free;
 	fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
 
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;
 
-		bits = ac->ac_sb->s_blocksize_bits + 1;
-		for (i = ac->ac_2order; i <= bits; i++)
-			if (grp->bb_counters[i] > 0)
-				return 1;
-		break;
+		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
 			return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	sbi = EXT4_SB(sb);
 	ngroups = ext4_get_groups_count(sb);
 	/* non-extent files are limited to low blocks/groups */
-	if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
 		ngroups = sbi->s_blockfile_groups;
 
 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
-			struct ext4_group_info *grp;
-			struct ext4_group_desc *desc;
-
 			if (group == ngroups)
 				group = 0;
 
-			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(sb, group);
-			if (grp->bb_free == 0)
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
 				continue;
 
 			err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
 				goto out;
 
 			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
 			if (!ext4_mb_good_group(ac, group, cr)) {
-				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
-				ext4_mb_release_desc(&e4b);
+				ext4_mb_unload_buddy(&e4b);
 				continue;
 			}
 
 			ac->ac_groups_scanned++;
-			desc = ext4_get_group_desc(sb, group, NULL);
 			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
 				ext4_mb_complex_scan_group(ac, &e4b);
 
 			ext4_unlock_group(sb, group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);
 
 			if (ac->ac_status != AC_STATUS_CONTINUE)
 				break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	ext4_lock_group(sb, group);
 	memcpy(&sg, ext4_get_group_info(sb, group), i);
 	ext4_unlock_group(sb, group);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 
 	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 
 #ifdef DOUBLE_CHECK
 	{
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			 entry->count, entry->group, entry);
 
 		if (test_opt(sb, DISCARD)) {
+			int ret;
 			ext4_fsblk_t discard_block;
 
 			discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			trace_ext4_discard_blocks(sb,
 					(unsigned long long)discard_block,
 					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
+			ret = sb_issue_discard(sb, discard_block, entry->count);
+			if (ret == -EOPNOTSUPP) {	/* negative errno */
+				ext4_warning(sb,
+					"discard not supported, disabling");
+				clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+			}
 		}
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		}
 		ext4_unlock_group(sb, entry->group);
 		kmem_cache_free(ext4_free_ext_cachep, entry);
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 	}
 
 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
 
 void exit_ext4_mballoc(void)
 {
-	/* 
+	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
 	 */
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
 		atomic_inc(&sbi->s_bal_reqs);
 		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
-		if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
 			atomic_inc(&sbi->s_bal_success);
 		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
 		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 			continue;
 
 		/* non-extent files can't have physical blocks past 2^32 */
-		if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
 			pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
 			continue;
 
@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	spin_unlock(&pa->pa_lock);
 
 	grp_blk = pa->pa_pstart;
-	/* 
+	/*
 	 * If doing group-based preallocation, pa_pstart may be in the
 	 * next group when pa is used up
 	 */
@@ -3697,7 +3747,7 @@ out:
 	ext4_unlock_group(sb, group);
 	if (ac)
 		kmem_cache_free(ext4_ac_cachep, ac);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
 }
@@ -3801,7 +3851,7 @@ repeat:
 		if (bitmap_bh == NULL) {
 			ext4_error(sb, "Error reading block bitmap for %u",
 					group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);
 			continue;
 		}
 
@@ -3810,7 +3860,7 @@ repeat:
 		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
 		ext4_unlock_group(sb, group);
 
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		put_bh(bitmap_bh);
 
 		list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_mb_release_group_pa(&e4b, pa, ac);
 		ext4_unlock_group(sb, group);
 
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
-			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
 	}
 
-	/* 
+	/*
 	 * We need to make sure we don't reuse the freed block until
 	 * after the transaction is committed, which we can do by
 	 * treating the block as metadata, below.  We make an
@@ -4610,7 +4660,7 @@ do_more:
 		atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 
 	freed += count;
 
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef4..6f3a27ec30b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 */
 	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
 				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
-	    (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;
 
 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc31..3a6c92ac131 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	int depth = ext_depth(orig_inode);
 	int ret;
 
+	start_ext.ee_block = end_ext.ee_block = 0;
 	o_start = o_end = oext = orig_path[depth].p_ext;
 	oext_alen = ext4_ext_get_actual_len(oext);
 	start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 * new_ext       |-------|
 	 */
 	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		ext4_error(orig_inode->i_sb,
+		EXT4_ERROR_INODE(orig_inode,
 			"new_ext_end(%u) should be less than or equal to "
 			"oext->ee_block(%u) + oext_alen(%d) - 1",
 			new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	while (1) {
 		/* The extent for donor must be found. */
 		if (!dext) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				   "The extent for donor must be found");
 			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				"Donor offset(%u) and the first block of donor "
 				"extent(%u) should be equal",
 				donor_off,
@@ -976,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	/* Ext4 move extent supports only extent based file */
-	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: orig file is not extents "
 			"based file [ino:orig %lu]\n", orig_inode->i_ino);
 		return -EOPNOTSUPP;
-	} else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: donor file is not extents "
 			"based file [ino:donor %lu]\n", donor_inode->i_ino);
 		return -EOPNOTSUPP;
@@ -1354,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			if (ret1 < 0)
 				break;
 			if (*moved_len > len) {
-				ext4_error(orig_inode->i_sb,
+				EXT4_ERROR_INODE(orig_inode,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd10..a43e6617b35 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
 		return blocksize;
 	return (len & 65532) | ((len & 3) << 16);
 }
-  
+
 __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 {
 	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 	if (len == blocksize) {
 		if (blocksize == 65536)
 			return cpu_to_le16(EXT4_MAX_REC_LEN);
-		else 
+		else
 			return cpu_to_le16(0);
 	}
 	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		brelse(bh);
 	}
 	if (bcount)
-		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
 		       levels ? "" : "   ", names, space/bcount,
 		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	int ret, err;
 	__u32 hashval;
 
-	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
 		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
-	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 		if (hinfo.hash_version <= DX_HASH_TEA)
 			hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
 {
 	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				     EXT4_FEATURE_COMPAT_DIR_INDEX))
-		EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }
 
 /*
@@ -943,8 +943,8 @@ restart:
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
-			ext4_error(sb, "reading directory #%lu offset %lu",
-				   dir->i_ino, (unsigned long)block);
+			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+					 (unsigned long) block);
 			brelse(bh);
 			goto next;
 		}
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
-			ext4_error(dir->i_sb, "bad inode number: %u", ino);
+			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
 		if (unlikely(IS_ERR(inode))) {
 			if (PTR_ERR(inode) == -ESTALE) {
-				ext4_error(dir->i_sb,
-						"deleted inode referenced: %u",
-						ino);
+				EXT4_ERROR_INODE(dir,
+						 "deleted inode referenced: %u",
+						 ino);
 				return ERR_PTR(-EIO);
 			} else {
 				return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	brelse(bh);
 
 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-		ext4_error(child->d_inode->i_sb,
-			   "bad inode number: %u", ino);
+		EXT4_ERROR_INODE(child->d_inode,
+				 "bad parent inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}
 
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;
 
 	while (count--) {
-		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
 						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
 		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
-		ext4_error(dir->i_sb,
-			   "invalid rec_len for '..' in inode %lu",
-			   dir->i_ino);
+		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
 		brelse(bh);
 		return -EIO;
 	}
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		brelse(bh);
 		return retval;
 	}
-	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
 	data1 = bh2->b_data;
 
 	memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		retval = ext4_dx_add_entry(handle, dentry, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			return retval;
-		EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
 		dx_fallback++;
 		ext4_mark_inode_dirty(handle, dir);
 	}
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
+	if (retval == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
 	return retval;
 }
 
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
 	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
-			ext4_error(inode->i_sb,
-				   "error %d reading directory #%lu offset 0",
-				   err, inode->i_ino);
+			EXT4_ERROR_INODE(inode,
+				"error %d reading directory lblock 0", err);
 		else
 			ext4_warning(inode->i_sb,
 				     "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
 	de = ext4_next_entry(de1, sb->s_blocksize);
 	while (offset < inode->i_size) {
 		if (!bh ||
-			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			unsigned int lblock;
 			err = 0;
 			brelse(bh);
-			bh = ext4_bread(NULL, inode,
-				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+			bh = ext4_bread(NULL, inode, lblock, 0, &err);
 			if (!bh) {
 				if (err)
-					ext4_error(sb,
-						   "error %d reading directory"
-						   " #%lu offset %u",
-						   err, inode->i_ino, offset);
+					EXT4_ERROR_INODE(inode,
+						"error %d reading directory "
+						"lblock %u", err, lblock);
 				offset += sb->s_blocksize;
 				continue;
 			}
@@ -2297,7 +2296,7 @@ retry:
 		}
 	} else {
 		/* clear the extent format for fast symlink */
-		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
 		inode->i_op = &ext4_fast_symlink_inode_operations;
 		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a..6df797eb9ae 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, input->group);
 		atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d..49d88c0597c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
@@ -941,6 +942,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
 		seq_puts(seq, ",journal_async_commit");
+	else if (test_opt(sb, JOURNAL_CHECKSUM))
+		seq_puts(seq, ",journal_checksum");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 	if (test_opt(sb, I_VERSION))
@@ -2213,7 +2216,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 struct ext4_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
-	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
 	int offset;
 };
@@ -2430,6 +2433,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
 {
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi;
@@ -2793,24 +2797,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext4_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext4_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext4_count_dirs(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-	}
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
-		goto failed_mount3;
-	}
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
 
@@ -2910,6 +2896,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+				  ext4_count_free_blocks(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+					  ext4_count_free_inodes(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+					  ext4_count_dirs(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount_wq;
+	}
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3001,7 @@ no_journal:
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
-			 "zone (%d)\n", err);
+			 "zone (%d)", err);
 		goto failed_mount4;
 	}
 
@@ -3040,9 +3040,11 @@ no_journal:
 	} else
 		descr = "out journal";
 
-	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+		"Opts: %s", descr, orig_data);
 
 	lock_kernel();
+	kfree(orig_data);
 	return 0;
 
 cantfind_ext4:
@@ -3059,6 +3061,10 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3072,6 @@ failed_mount3:
 		else
 			kfree(sbi->s_flex_groups);
 	}
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3091,7 @@ out_fail:
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
+	kfree(orig_data);
 	return ret;
 }
 
@@ -3380,7 +3383,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
-		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3488,10 @@ int ext4_force_commit(struct super_block *sb)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal)
+	if (journal) {
+		vfs_check_frozen(sb, SB_FREEZE_WRITE);
 		ret = ext4_journal_force_commit(journal);
+	}
 
 	return ret;
 }
@@ -3535,18 +3540,16 @@ static int ext4_freeze(struct super_block *sb)
 	 * the journal.
 	 */
 	error = jbd2_journal_flush(journal);
-	if (error < 0) {
-	out:
-		jbd2_journal_unlock_updates(journal);
-		return error;
-	}
+	if (error < 0)
+		goto out;
 
 	/* Journal blocked and flushed, clear needs_recovery flag. */
 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	error = ext4_commit_super(sb, 1);
-	if (error)
-		goto out;
-	return 0;
+out:
+	/* we rely on s_frozen to stop further updates */
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	return error;
 }
 
 /*
@@ -3563,7 +3566,6 @@ static int ext4_unfreeze(struct super_block *sb)
 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	ext4_commit_super(sb, 1);
 	unlock_super(sb);
-	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return 0;
 }
 
@@ -3580,6 +3582,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 
 	lock_kernel();
 
@@ -3713,6 +3716,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+
+	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+	kfree(orig_data);
 	return 0;
 
 restore_opts:
@@ -3734,6 +3740,7 @@ restore_opts:
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	kfree(orig_data);
 	return err;
 }
 
@@ -4141,6 +4148,7 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_check_flag_values();
 	err = init_ext4_system_zone();
 	if (err)
 		return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be..ed9354aff27 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e951508..04338009793 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
 bad_block:
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext4_xattr_check_block(bs->bh)) {
-			ext4_error(sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -820,7 +818,7 @@ inserted:
 						EXT4_I(inode)->i_block_group);
 
 			/* non-extent files can't have physical blocks past 2^32 */
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
 			block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
 			if (error)
 				goto cleanup;
 
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
 
 			ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
 	goto cleanup;
 
 bad_block:
-	ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-		   inode->i_ino, EXT4_I(inode)->i_file_acl);
+	EXT4_ERROR_INODE(inode, "bad block %llu",
+			 EXT4_I(inode)->i_file_acl);
 	goto cleanup;
 
 #undef header
@@ -1194,8 +1192,8 @@ retry:
 		if (!bh)
 			goto cleanup;
 		if (ext4_xattr_check_block(bh)) {
-			ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
-		ext4_error(inode->i_sb, "inode %lu: block %llu read error",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "block %llu read error",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
 		}
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
-			ext4_error(inode->i_sb,
-				"inode %lu: block %lu read error",
-				inode->i_ino, (unsigned long) ce->e_block);
+			EXT4_ERROR_INODE(inode, "block %lu read error",
+					 (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT4_XATTR_REFCOUNT_MAX) {
 			ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565..ae8200f84e3 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 	while (*fclus < cluster) {
 		/* prevent the infinite loop of cluster chain */
 		if (*fclus > limit) {
-			fat_fs_error(sb, "%s: detected the cluster chain loop"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb,
+					"%s: detected the cluster chain loop"
+					" (i_pos %lld)", __func__,
+					MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		}
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 		if (nr < 0)
 			goto out;
 		else if (nr == FAT_ENT_FREE) {
-			fat_fs_error(sb, "%s: invalid cluster chain"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
+					       " (i_pos %lld)", __func__,
+					       MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		} else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca0151..ee42b9e0b16 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include <linux/kernel.h>
 #include "fat.h"
 
 /*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
 {
 	const wchar_t *ip;
 	wchar_t ec;
-	unsigned char *op, nc;
+	unsigned char *op;
 	int charlen;
-	int k;
 
 	ip = uni;
 	op = ascii;
 
 	while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
 		ec = *ip++;
-		if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
+		if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
 			op += charlen;
 			len -= charlen;
 		} else {
 			if (uni_xlate == 1) {
-				*op = ':';
-				for (k = 4; k > 0; k--) {
-					nc = ec & 0xF;
-					op[k] = nc > 9	? nc + ('a' - 10)
-							: nc + '0';
-					ec >>= 4;
-				}
-				op += 5;
+				*op++ = ':';
+				op = pack_hex_byte(op, ec >> 8);
+				op = pack_hex_byte(op, ec);
 				len -= 5;
 			} else {
 				*op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
 	return ret;
 }
 
-static int fat_dir_ioctl(struct inode *inode, struct file *filp,
-			 unsigned int cmd, unsigned long arg)
+static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
 	int short_only, both;
 
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
 		both = 1;
 		break;
 	default:
-		return fat_generic_ioctl(inode, filp, cmd, arg);
+		return fat_generic_ioctl(filp, cmd, arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 		both = 1;
 		break;
 	default:
-		return -ENOIOCTLCMD;
+		return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= fat_readdir,
-	.ioctl		= fat_dir_ioctl,
+	.unlocked_ioctl	= fat_dir_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= fat_compat_dir_ioctl,
 #endif
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6d..53dba57b49a 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
 #include <linux/nls.h>
 #include <linux/fs.h>
 #include <linux/mutex.h>
+#include <linux/ratelimit.h>
 #include <linux/msdos_fs.h>
 
 /*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
 	struct fatent_operations *fatent_ops;
 	struct inode *fat_inode;
 
+	struct ratelimit_state ratelimit;
+
 	spinlock_t inode_hash_lock;
 	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
 };
@@ -298,8 +301,8 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
 extern int fat_count_free_clusters(struct super_block *sb);
 
 /* fat/file.c */
-extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg);
+extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
+			      unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 		            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3))) __cold;
+extern void
+__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(s, fmt, args...)		\
+	__fat_fs_error(s, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(s, fmt, args...) \
+	__fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236..a14c2f6a489 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
 
 #include <linux/capability.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
 	return err;
 }
 
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
-		      unsigned int cmd, unsigned long arg)
+long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 __user *user_attr = (u32 __user *)arg;
 
 	switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	}
 }
 
+#ifdef CONFIG_COMPAT
+static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
+				      unsigned long arg)
+
+{
+	return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
 static int fat_file_release(struct inode *inode, struct file *filp)
 {
 	if ((filp->f_mode & FMODE_WRITE) &&
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.release	= fat_file_release,
-	.ioctl		= fat_generic_ioctl,
+	.unlocked_ioctl	= fat_generic_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= fat_generic_compat_ioctl,
+#endif
 	.fsync		= fat_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d5..ed33904926e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1250,6 +1250,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
 	sbi->dir_ops = fs_dir_inode_ops;
+	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+			     DEFAULT_RATELIMIT_BURST);
 
 	error = parse_options(data, isvfat, silent, &debug, &sbi->options);
 	if (error)
@@ -1497,10 +1499,8 @@ out_fail:
 		iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
-	if (sbi->nls_io)
-		unload_nls(sbi->nls_io);
-	if (sbi->nls_disk)
-		unload_nls(sbi->nls_disk);
+	unload_nls(sbi->nls_io);
+	unload_nls(sbi->nls_disk);
 	if (sbi->options.iocharset != fat_default_iocharset)
 		kfree(sbi->options.iocharset);
 	sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f2646..1fa23f6ffba 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
  * In case the file system is remounted read-only, it can be made writable
  * again by remounting it.
  */
-void fat_fs_error(struct super_block *s, const char *fmt, ...)
+void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
 {
 	struct fat_mount_options *opts = &MSDOS_SB(s)->options;
 	va_list args;
 
-	printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
+	if (report) {
+		printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
 
-	printk(KERN_ERR "    ");
-	va_start(args, fmt);
-	vprintk(fmt, args);
-	va_end(args);
-	printk("\n");
+		printk(KERN_ERR "    ");
+		va_start(args, fmt);
+		vprintk(fmt, args);
+		va_end(args);
+		printk("\n");
+	}
 
 	if (opts->errors == FAT_ERRORS_PANIC)
-		panic("    FAT fs panic from previous error\n");
+		panic("FAT: fs panic from previous error\n");
 	else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
 		s->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "    File system has been set read-only\n");
+		printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
 	}
 }
-EXPORT_SYMBOL_GPL(fat_fs_error);
+EXPORT_SYMBOL_GPL(__fat_fs_error);
 
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f8..0ec7bb2c95c 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
 };
 
 const struct file_operations vxfs_dir_operations = {
+	.llseek =		generic_file_llseek,
+	.read =			generic_read_dir,
 	.readdir =		vxfs_readdir,
 };
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5c4161f1fd9..ea8592b9069 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -409,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	do {
+	 while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 		spin_lock(&inode_lock);
-	} while (inode->i_state & I_SYNC);
+	}
 }
 
 /*
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70..4a8eb31c533 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
 	/* banners (can't represent line 0 by pos 0 as that would involve
 	 * returning a NULL pointer) */
 	if (pos == 0)
-		return (struct fscache_object *) ++(*_pos);
+		return (struct fscache_object *)(long)++(*_pos);
 	if (pos < 3)
 		return (struct fscache_object *)pos;
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691..e53df5ebb2b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
 
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 9fb76b0a048..48171f4c943 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
 				 void *buffer, size_t size, int xtype)
 {
 	struct inode *inode = dentry->d_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct posix_acl *acl;
 	int type;
 	int error;
 
+	if (!sdp->sd_args.ar_posix_acl)
+		return -EOPNOTSUPP;
+
 	type = gfs2_acl_type(name);
 	if (type < 0)
 		return type;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f8..b20bfcc9fa2 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_drop_write;
 
+	error = -EACCES;
+	if (!is_owner_or_cap(inode))
+		goto out;
+
+	error = 0;
 	flags = ip->i_diskflags;
 	new_flags = (flags & ~mask) | (reqflags & mask);
 	if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 fsflags, gfsflags;
+
 	if (get_user(fsflags, ptr))
 		return -EFAULT;
+
 	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
 	if (!S_ISDIR(inode->i_mode)) {
 		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 51d8061fa07..b5612cbb62a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -242,34 +242,38 @@ fail:
 }
 
 /**
- * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation
+ * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+ *                               and try to reclaim it by doing iput.
+ *
+ * This function assumes no rgrp locks are currently held.
+ *
  * @sb: The super block
  * no_addr: The inode number
- * @@inode: A pointer to the inode found, if any
  *
- * Returns: 0 and *inode if no errors occurred.  If an error occurs,
- *          the resulting *inode may or may not be NULL.
  */
 
-int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
-			       struct inode **inode)
+void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 {
 	struct gfs2_sbd *sdp;
 	struct gfs2_inode *ip;
 	struct gfs2_glock *io_gl;
 	int error;
 	struct gfs2_holder gh;
+	struct inode *inode;
 
-	*inode = gfs2_iget_skip(sb, no_addr);
+	inode = gfs2_iget_skip(sb, no_addr);
 
-	if (!(*inode))
-		return -ENOBUFS;
+	if (!inode)
+		return;
 
-	if (!((*inode)->i_state & I_NEW))
-		return -ENOBUFS;
+	/* If it's not a new inode, someone's using it, so leave it alone. */
+	if (!(inode->i_state & I_NEW)) {
+		iput(inode);
+		return;
+	}
 
-	ip = GFS2_I(*inode);
-	sdp = GFS2_SB(*inode);
+	ip = GFS2_I(inode);
+	sdp = GFS2_SB(inode);
 	ip->i_no_formal_ino = -1;
 
 	error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
@@ -284,15 +288,13 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
 	set_bit(GIF_INVALID, &ip->i_flags);
 	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
 				   &ip->i_iopen_gh);
-	if (unlikely(error)) {
-		if (error == GLR_TRYFAILED)
-			error = 0;
+	if (unlikely(error))
 		goto fail_iopen;
-	}
+
 	ip->i_iopen_gh.gh_gl->gl_object = ip;
 	gfs2_glock_put(io_gl);
 
-	(*inode)->i_mode = DT2IF(DT_UNKNOWN);
+	inode->i_mode = DT2IF(DT_UNKNOWN);
 
 	/*
 	 * We must read the inode in order to work out its type in
@@ -303,16 +305,17 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
 	 */
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
 				   &gh);
-	if (unlikely(error)) {
-		if (error == GLR_TRYFAILED)
-			error = 0;
+	if (unlikely(error))
 		goto fail_glock;
-	}
+
 	/* Inode is now uptodate */
 	gfs2_glock_dq_uninit(&gh);
-	gfs2_set_iop(*inode);
+	gfs2_set_iop(inode);
+
+	/* The iput will cause it to be deleted. */
+	iput(inode);
+	return;
 
-	return 0;
 fail_glock:
 	gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
@@ -321,7 +324,8 @@ fail_put:
 	ip->i_gl->gl_object = NULL;
 	gfs2_glock_put(ip->i_gl);
 fail:
-	return error;
+	iget_failed(inode);
+	return;
 }
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index e161461d4c5..300ada3f21d 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -84,8 +84,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
 				       u64 no_addr, u64 no_formal_ino);
-extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
-				      struct inode **inode);
+extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b593f0e28f2..6a857e24f94 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -696,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
  *
  */
 
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
 	struct gfs2_ail *ai;
 
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eb570b4ad44..0d007f92023 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,28 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 	sdp->sd_log_head = sdp->sd_log_tail = value;
 }
 
-unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 			    unsigned int ssize);
 
-int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 
-struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
-struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 				      struct buffer_head *real);
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 
-static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
-{
-	if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
-		__gfs2_log_flush(sbd, gl);
-}
-
-void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-
-void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
-int gfs2_logd(void *data);
+extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+extern int gfs2_logd(void *data);
 
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 117fa4171f6..171a744f8e4 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1192,7 +1192,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct inode *inode;
 	int error = 0;
 	u64 last_unlinked = NO_BLOCK, unlinked;
 
@@ -1210,22 +1209,27 @@ try_again:
 	if (error)
 		return error;
 
+	/* Find an rgrp suitable for allocation.  If it encounters any unlinked
+	   dinodes along the way, error will equal -EAGAIN and unlinked will
+	   contain its block address. We then need to look up that inode and
+	   try to free it, and try the allocation again. */
 	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
 	if (error) {
 		if (ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
 		if (error != -EAGAIN)
 			return error;
-		error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb,
-						   unlinked, &inode);
-		if (inode)
-			iput(inode);
+
+		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
+		/* regardless of whether or not gfs2_process_unlinked_inode
+		   was successful, we don't want to repeat it again. */
+		last_unlinked = unlinked;
 		gfs2_log_flush(sdp, NULL);
-		if (error == GLR_TRYFAILED)
-			error = 0;
+		error = 0;
+
 		goto try_again;
 	}
-
+	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;
 
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f402367825..764fd1bdca8 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 const struct file_operations hfsplus_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= hfsplus_readdir,
-	.ioctl          = hfsplus_ioctl,
+	.unlocked_ioctl = hfsplus_ioctl,
 	.llseek		= generic_file_llseek,
 	.release	= hfsplus_dir_release,
 };
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9d..6505c30ad96 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
 
 /* ioctl.c */
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		  unsigned long arg);
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 int hfsplus_setxattr(struct dentry *dentry, const char *name,
 		     const void *value, size_t size, int flags);
 ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c056..9bbb82924a2 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
 	.fsync		= file_fsync,
 	.open		= hfsplus_file_open,
 	.release	= hfsplus_file_release,
-	.ioctl          = hfsplus_ioctl,
+	.unlocked_ioctl = hfsplus_ioctl,
 };
 
 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51a..ac405f09902 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
 
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		  unsigned long arg)
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	unsigned int flags;
 
+	lock_kernel();
 	switch (cmd) {
 	case HFSPLUS_IOC_EXT2_GETFLAGS:
 		flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 	case HFSPLUS_IOC_EXT2_SETFLAGS: {
 		int err = 0;
 		err = mnt_want_write(filp->f_path.mnt);
-		if (err)
+		if (err) {
+			unlock_kernel();
 			return err;
+		}
 
 		if (!is_owner_or_cap(inode)) {
 			err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		mark_inode_dirty(inode);
 setflags_out:
 		mnt_drop_write(filp->f_path.mnt);
+		unlock_kernel();
 		return err;
 	}
 	default:
+		unlock_kernel();
 		return -ENOTTY;
 	}
 }
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a48..e0aca9a0ac6 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
 
 const struct file_operations isofs_dir_operations =
 {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = isofs_readdir,
 };
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900..e214d68620a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
 	if (handle->h_sync)
 		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
-	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
 	transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
 		jbd_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
-		__jbd2_log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
+		jbd2_log_start_commit(journal, transaction->t_tid);
 
 		/*
 		 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
 			err = jbd2_log_wait_commit(journal, tid);
 	} else {
 		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
 	}
 
 	lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e5..9578cbe0cd5 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
 		      
 const struct file_operations ncp_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ncp_readdir,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a..b9387089289 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
 	.llseek 	= ncp_remote_llseek,
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea..023c03d0207 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,6 +20,7 @@
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 #include <linux/ncp_fs.h>
 
@@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-static int __ncp_ioctl(struct inode *inode, struct file *filp,
-	      unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
@@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
 	}
 }
 
-int ncp_ioctl(struct inode *inode, struct file *filp,
-	      unsigned int cmd, unsigned long arg)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	int ret;
+	long ret;
 
+	lock_kernel();
 	if (ncp_ioctl_need_write(cmd)) {
 		/*
 		 * inside the ioctl(), any failures which
@@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
 		 * -EACCESS, so it seems consistent to keep
 		 *  that here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt))
-			return -EACCES;
+		if (mnt_want_write(filp->f_path.mnt)) {
+			ret = -EACCES;
+			goto out;
+		}
 	}
-	ret = __ncp_ioctl(inode, filp, cmd, arg);
+	ret = __ncp_ioctl(filp, cmd, arg);
 	if (ncp_ioctl_need_write(cmd))
 		mnt_drop_write(filp->f_path.mnt);
+
+out:
+	unlock_kernel();
 	return ret;
 }
 
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
+	long ret;
 
 	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
-	ret = ncp_ioctl(inode, file, cmd, arg);
+	ret = ncp_ioctl(file, cmd, arg);
 	unlock_kernel();
 	return ret;
 }
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf..db64854b7b0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1741,6 +1741,7 @@ remove_lru_entry:
 			clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
 			smp_mb__after_clear_bit();
 		}
+		spin_unlock(&inode->i_lock);
 	}
 	spin_unlock(&nfs_access_lru_lock);
 	nfs_access_free_list(&head);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa..04214fc5c30 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			break;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab..91679e2631e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
 	int res = 0;
 
 	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
-		goto out;
+		goto out_mark_dirty;
 	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
 			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
 					nfs_wait_bit_killable,
 					TASK_KILLABLE);
+		else
+			goto out_mark_dirty;
 	} else
 		nfs_commit_clear_lock(NFS_I(inode));
-out:
+	return res;
+	/* Note: If we exit without ensuring that the commit is complete,
+	 * we must mark the inode as dirty. Otherwise, future calls to
+	 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+	 * that the data is on the disk.
+	 */
+out_mark_dirty:
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return res;
 }
 
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
 	};
 	int ret;
 
-	while(PagePrivate(page)) {
+	for (;;) {
 		wait_on_page_writeback(page);
 		if (clear_page_dirty_for_io(page)) {
 			ret = nfs_writepage_locked(page, &wbc);
 			if (ret < 0)
 				goto out_error;
+			continue;
 		}
-		ret = sync_inode(inode, &wbc);
+		if (!PagePrivate(page))
+			break;
+		ret = nfs_commit_inode(inode, FLUSH_SYNC);
 		if (ret < 0)
 			goto out_error;
 	}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f..508941c23af 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (sscanf(buf, "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX)
+	if (port < 1 || port > USHRT_MAX)
 		return -EINVAL;
 
 	err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
 	if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
+	if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
 		return -EINVAL;
 
 	xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba7..a1924a0d2ab 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
  * the page at all.  For a more detailed explanation see ntfs_truncate() in
  * fs/ntfs/inode.c.
  *
- * @cached_page and @lru_pvec are just optimizations for dealing with multiple
- * pages.
- *
  * Return 0 on success and -errno on error.  In the case that an error is
  * encountered it is possible that the initialized size will already have been
  * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
  * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
  *	    held by the caller.
  */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
-		struct page **cached_page, struct pagevec *lru_pvec)
+static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
 {
 	s64 old_init_size;
 	loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
  * Obtain @nr_pages locked page cache pages from the mapping @mapping and
  * starting at index @index.
  *
- * If a page is newly created, increment its refcount and add it to the
- * caller's lru-buffering pagevec @lru_pvec.
- *
- * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
- * are obtained at once instead of just one page and that 0 is returned on
- * success and -errno on error.
+ * If a page is newly created, add it to the LRU list.
  *
  * Note, the page locks are obtained in ascending page index order.
  */
 static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 		pgoff_t index, const unsigned nr_pages, struct page **pages,
-		struct page **cached_page, struct pagevec *lru_pvec)
+		struct page **cached_page)
 {
 	int err, nr;
 
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 					goto err_out;
 				}
 			}
-			err = add_to_page_cache(*cached_page, mapping, index,
+			err = add_to_page_cache_lru(*cached_page, mapping, index,
 					GFP_KERNEL);
 			if (unlikely(err)) {
 				if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 				goto err_out;
 			}
 			pages[nr] = *cached_page;
-			page_cache_get(*cached_page);
-			if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
-				__pagevec_lru_add_file(lru_pvec);
 			*cached_page = NULL;
 		}
 		index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 	ssize_t status, written;
 	unsigned nr_pages;
 	int err;
-	struct pagevec lru_pvec;
 
 	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
 			"pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 			}
 		}
 	}
-	pagevec_init(&lru_pvec, 0);
 	written = 0;
 	/*
 	 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 	ll = ni->initialized_size;
 	read_unlock_irqrestore(&ni->size_lock, flags);
 	if (pos > ll) {
-		err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
-				&lru_pvec);
+		err = ntfs_attr_extend_initialized(ni, pos);
 		if (err < 0) {
 			ntfs_error(vol->sb, "Cannot perform write to inode "
 					"0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 			ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
 		/* Get and lock @do_pages starting at index @start_idx. */
 		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
-				pages, &cached_page, &lru_pvec);
+				pages, &cached_page);
 		if (unlikely(status))
 			break;
 		/*
@@ -2077,7 +2062,6 @@ err_out:
 	*ppos = pos;
 	if (cached_page)
 		page_cache_release(cached_page);
-	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
 			(long)status);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3..ec6d1233959 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
 	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
 	 * larger than 16 bits.
 	 */
-	BUG_ON(ecc > USHORT_MAX);
+	BUG_ON(ecc > USHRT_MAX);
 
 	bc->bc_crc32e = cpu_to_le32(crc);
 	bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
 	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
 	 * larger than 16 bits.
 	 */
-	BUG_ON(ecc > USHORT_MAX);
+	BUG_ON(ecc > USHRT_MAX);
 
 	bc->bc_crc32e = cpu_to_le32(crc);
 	bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 3ceca05b668..648c9d8f335 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/stringify.h>
+#include <linux/kernel.h>
 #include "ldm.h"
 #include "check.h"
 #include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
 	int h;
 
 	/* high part */
-	if      ((x = src[0] - '0') <= '9'-'0') h = x;
-	else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10;
-	else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10;
-	else return -1;
-	h <<= 4;
+	x = h = hex_to_bin(src[0]);
+	if (h < 0)
+		return -1;
 
 	/* low part */
-	if ((x = src[1] - '0') <= '9'-'0') return h | x;
-	if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10);
-	if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10);
-	return -1;
+	h = hex_to_bin(src[1]);
+	if (h < 0)
+		return -1;
+
+	return (x << 4) + h;
 }
 
 /**
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac..9b58d38bc91 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
 		collect_sigign_sigcatch(p, &ignored, &caught);
-		num_threads = atomic_read(&p->signal->count);
+		num_threads = get_nr_threads(p);
 		rcu_read_lock();  /* FIXME: is this correct? */
 		qsize = atomic_read(&__task_cred(p)->user->sigpending);
 		rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			tty_nr = new_encode_dev(tty_devnum(sig->tty));
 		}
 
-		num_threads = atomic_read(&sig->count);
+		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
 		cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449d..acb7ef80ea4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
 	return result;
 }
 
-static int get_nr_threads(struct task_struct *tsk)
-{
-	unsigned long flags;
-	int count = 0;
-
-	if (lock_task_sighand(tsk, &flags)) {
-		count = atomic_read(&tsk->signal->count);
-		unlock_task_sighand(tsk, &flags);
-	}
-	return count;
-}
-
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-EINVAL);
+	struct dentry *error;
 
 	/* Allocate the inode */
 	error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
+	struct dentry *result;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c12749060..2791907744e 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000		reserved
- * 00000001-00000fff	static entries	(goners)
- *      001		root-ino
- *
- * 00001000-00001fff	unused
- * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
- * 80000000-efffffff	unused
- * f0000000-ffffffff	dynamic entries
- *
- * Goal:
- *	Once we split the thing into several virtual filesystems,
- *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351b..6f37c391468 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
  */
 static void __init proc_kcore_text_init(void)
 {
-	kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
 }
 #else
 static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a6..4258384ed22 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
 	if (err)
 		return;
 	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-	err = PTR_ERR(proc_mnt);
 	if (IS_ERR(proc_mnt)) {
 		unregister_filesystem(&proc_fs_type);
 		return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56..aea1d3f1ffb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 {
 	u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 	return err;
 }
+#endif /* HUGETLB_PAGE */
 
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+#endif
 	pagemap_walk.mm = mm;
 	pagemap_walk.private = &pm;
 
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcb..3d3fd469213 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,6 +77,7 @@ out:
 
 const struct file_operations qnx4_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
 	.fsync		= simple_fsync,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c..1ad8bf076cf 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1514,11 +1514,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 /*
  * This operation can block, but only after everything is updated
  */
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
-		int warn, int reserve)
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 {
 	int cnt, ret = 0;
 	char warntype[MAXQUOTAS];
+	int warn = flags & DQUOT_SPACE_WARN;
+	int reserve = flags & DQUOT_SPACE_RESERVE;
+	int nofail = flags & DQUOT_SPACE_NOFAIL;
 
 	/*
 	 * First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1541,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 			continue;
 		ret = check_bdq(inode->i_dquot[cnt], number, !warn,
 				warntype+cnt);
-		if (ret) {
+		if (ret && !nofail) {
 			spin_unlock(&dq_data_lock);
 			goto out_flush_warn;
 		}
@@ -1638,10 +1640,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
 /*
  * This operation can block, but only after everything is updated
  */
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
+	int reserve = flags & DQUOT_SPACE_RESERVE;
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2..9c0485236e6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(generic_file_llseek);
 
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to (ignored)
+ * @origin:	type of seek (ignored)
+ *
+ * This is an implementation of ->llseek usable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case, use noop_llseek() instead of falling
+ * back to the default ->llseek; it returns file->f_pos and changes nothing.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+{
+	return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
+
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
 	return -ESPIPE;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a95..4455fbe269a 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -18,6 +18,7 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 			      int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427..00a70cab1f3 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
 
 const struct file_operations smb_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= smb_readdir,
-	.ioctl		= smb_ioctl,
+	.unlocked_ioctl	= smb_ioctl,
 	.open		= smb_dir_open,
 };
 
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf0..84ecf0e43f9 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -437,7 +437,7 @@ const struct file_operations smb_file_operations =
 	.aio_read	= smb_file_aio_read,
 	.write		= do_sync_write,
 	.aio_write	= smb_file_aio_write,
-	.ioctl		= smb_ioctl,
+	.unlocked_ioctl	= smb_ioctl,
 	.mmap		= smb_file_mmap,
 	.open		= smb_file_open,
 	.release	= smb_file_release,
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26..07215312ad3 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/highuid.h>
+#include <linux/smp_lock.h>
 #include <linux/net.h>
 
 #include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
 
 #include "proto.h"
 
-int
-smb_ioctl(struct inode *inode, struct file *filp,
-	  unsigned int cmd, unsigned long arg)
+long
+smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct smb_sb_info *server = server_from_inode(inode);
+	struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
 	struct smb_conn_opt opt;
 	int result = -EINVAL;
 
+	lock_kernel();
 	switch (cmd) {
 		uid16_t uid16;
 		uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
 	default:
 		break;
 	}
+	unlock_kernel();
 
 	return result;
 }
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d..05939a6f43e 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern const struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
-extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 /* smbiod.c */
 extern void smbiod_wake_up(void);
 extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046..00b2909bd46 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
-#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d68..cc6ce8a84c2 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+config SQUASHFS_XATTRS
+	bool "Squashfs XATTR support"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here includes support for extended attributes (xattrs).
+	  Xattrs are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page).
+
+	  If unsure, say N.
+
 config SQUASHFS_EMBEDDED
 
 	bool "Additional option for memory-constrained systems" 
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870..2cee3e9fa45 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e4..62e63ad2507 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
 
 #include <linux/fs.h>
 #include <linux/vfs.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 /*
  * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 	int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
 	union squashfs_inode squashfs_ino;
 	struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+	int xattr_id = SQUASHFS_INVALID_XATTR;
 
 	TRACE("Entered squashfs_read_inode\n");
 
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 			frag_offset = 0;
 		}
 
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
 		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
 		inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+		inode->i_op = &squashfs_inode_ops;
 		inode->i_fop = &generic_ro_fops;
 		inode->i_mode |= S_IFREG;
 		inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		if (err < 0)
 			goto failed_read;
 
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
 		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
 		inode->i_size = le32_to_cpu(sqsh_ino->file_size);
 		inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 
 		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
 		inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
-		inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &squashfs_symlink_inode_ops;
 		inode->i_data.a_ops = &squashfs_symlink_aops;
 		inode->i_mode |= S_IFLNK;
 		squashfs_i(inode)->start = block;
 		squashfs_i(inode)->offset = offset;
 
+		if (type == SQUASHFS_LSYMLINK_TYPE) {
+			__le32 xattr;
+
+			err = squashfs_read_metadata(sb, NULL, &block,
+						&offset, inode->i_size);
+			if (err < 0)
+				goto failed_read;
+			err = squashfs_read_metadata(sb, &xattr, &block,
+						&offset, sizeof(xattr));
+			if (err < 0)
+				goto failed_read;
+			xattr_id = le32_to_cpu(xattr);
+		}
+
 		TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
 				"%x\n", SQUASHFS_INODE_BLK(ino), offset,
 				block, offset);
 		break;
 	}
 	case SQUASHFS_BLKDEV_TYPE:
-	case SQUASHFS_CHRDEV_TYPE:
-	case SQUASHFS_LBLKDEV_TYPE:
-	case SQUASHFS_LCHRDEV_TYPE: {
+	case SQUASHFS_CHRDEV_TYPE: {
 		struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
 		unsigned int rdev;
 
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 				SQUASHFS_INODE_BLK(ino), offset, rdev);
 		break;
 	}
+	case SQUASHFS_LBLKDEV_TYPE:
+	case SQUASHFS_LCHRDEV_TYPE: {
+		struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
+		unsigned int rdev;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_LCHRDEV_TYPE)
+			inode->i_mode |= S_IFCHR;
+		else
+			inode->i_mode |= S_IFBLK;
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		inode->i_op = &squashfs_inode_ops;
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		rdev = le32_to_cpu(sqsh_ino->rdev);
+		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+		TRACE("Device inode %x:%x, rdev %x\n",
+				SQUASHFS_INODE_BLK(ino), offset, rdev);
+		break;
+	}
 	case SQUASHFS_FIFO_TYPE:
-	case SQUASHFS_SOCKET_TYPE:
-	case SQUASHFS_LFIFO_TYPE:
-	case SQUASHFS_LSOCKET_TYPE: {
+	case SQUASHFS_SOCKET_TYPE: {
 		struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
 
 		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		init_special_inode(inode, inode->i_mode, 0);
 		break;
 	}
+	case SQUASHFS_LFIFO_TYPE:
+	case SQUASHFS_LSOCKET_TYPE: {
+		struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_LFIFO_TYPE)
+			inode->i_mode |= S_IFIFO;
+		else
+			inode->i_mode |= S_IFSOCK;
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		inode->i_op = &squashfs_inode_ops;
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		init_special_inode(inode, inode->i_mode, 0);
+		break;
+	}
 	default:
 		ERROR("Unknown inode type %d in squashfs_iget!\n", type);
 		return -EINVAL;
 	}
 
+	if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
+		err = squashfs_xattr_lookup(sb, xattr_id,
+					&squashfs_i(inode)->xattr_count,
+					&squashfs_i(inode)->xattr_size,
+					&squashfs_i(inode)->xattr);
+		if (err < 0)
+			goto failed_read;
+		inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
+				+ 1;
+	} else
+		squashfs_i(inode)->xattr_count = 0;
+
 	return 0;
 
 failed_read:
 	ERROR("Unable to read inode 0x%llx\n", ino);
 	return err;
 }
+
+/* Inode operations providing xattr reads only: getxattr and listxattr. */
+const struct inode_operations squashfs_inode_ops = {
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
+};
+
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad93..7a9464d08cf 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dcache.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 /*
  * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
 
 
 const struct inode_operations squashfs_dir_inode_ops = {
-	.lookup = squashfs_lookup
+	.lookup = squashfs_lookup,
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
 };
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af551..733a17c4294 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
 				unsigned int);
 extern int squashfs_read_inode(struct inode *, long long);
 
+/* xattr.c */
+extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
+
 /*
- * Inodes, files and decompressor operations
+ * Inodes, files, decompressor and xattr operations
  */
 
 /* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
 /* file.c */
 extern const struct address_space_operations squashfs_aops;
 
+/* inode.c */
+extern const struct inode_operations squashfs_inode_ops;
+
 /* namei.c */
 extern const struct inode_operations squashfs_dir_inode_ops;
 
 /* symlink.c */
 extern const struct address_space_operations squashfs_symlink_aops;
+extern const struct inode_operations squashfs_symlink_inode_ops;
+
+/* xattr.c */
+extern const struct xattr_handler *squashfs_xattr_handlers[];
 
 /* zlib_wrapper.c */
 extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea0..8eabb808b78 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
 #define SQUASHFS_NAME_LEN		256
 
 #define SQUASHFS_INVALID_FRAG		(0xffffffffU)
+#define SQUASHFS_INVALID_XATTR		(0xffffffffU)
 #define SQUASHFS_INVALID_BLK		(-1LL)
 
 /* Filesystem flags */
@@ -96,6 +97,13 @@
 #define SQUASHFS_LFIFO_TYPE		13
 #define SQUASHFS_LSOCKET_TYPE		14
 
+/* Xattr types */
+#define SQUASHFS_XATTR_USER             0
+#define SQUASHFS_XATTR_TRUSTED          1
+#define SQUASHFS_XATTR_SECURITY         2
+#define SQUASHFS_XATTR_VALUE_OOL        256
+#define SQUASHFS_XATTR_PREFIX_MASK      0xff
+
 /* Flag whether block is compressed or uncompressed, bit is set if block is
  * uncompressed */
 #define SQUASHFS_COMPRESSED_BIT		(1 << 15)
@@ -174,6 +182,24 @@
 
 #define SQUASHFS_ID_BLOCK_BYTES(A)	(SQUASHFS_ID_BLOCKS(A) *\
 					sizeof(u64))
+/* xattr id lookup table defines */
+#define SQUASHFS_XATTR_BYTES(A)		((A) * sizeof(struct squashfs_xattr_id))
+
+#define SQUASHFS_XATTR_BLOCK(A)		(SQUASHFS_XATTR_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_OFFSET(A)	(SQUASHFS_XATTR_BYTES(A) % \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCKS(A)	((SQUASHFS_XATTR_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_BYTES(A)	(SQUASHFS_XATTR_BLOCKS(A) *\
+					sizeof(u64))
+#define SQUASHFS_XATTR_BLK(A)		((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_XATTR_OFFSET(A)	((unsigned int) ((A) & 0xffff))
 
 /* cached data constants for filesystem */
 #define SQUASHFS_CACHED_BLKS		8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
 	__le64			root_inode;
 	__le64			bytes_used;
 	__le64			id_table_start;
-	__le64			xattr_table_start;
+	__le64			xattr_id_table_start;
 	__le64			inode_table_start;
 	__le64			directory_table_start;
 	__le64			fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
 	__le32			nlink;
 };
 
+struct squashfs_lipc_inode {	/* ipc inode variant with xattr id */
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+	__le32			xattr;	/* xattr id, see xattr_id.c */
+};
+
 struct squashfs_dev_inode {
 	__le16			inode_type;
 	__le16			mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
 	__le32			rdev;
 };
 
+struct squashfs_ldev_inode {	/* dev inode variant with xattr id */
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32			inode_number;
+	__le32			nlink;
+	__le32			rdev;
+	__le32			xattr;	/* xattr id, see xattr_id.c */
+};
+
 struct squashfs_symlink_inode {
 	__le16			inode_type;
 	__le16			mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
 union squashfs_inode {
 	struct squashfs_base_inode		base;
 	struct squashfs_dev_inode		dev;
+	struct squashfs_ldev_inode		ldev;
 	struct squashfs_symlink_inode		symlink;
 	struct squashfs_reg_inode		reg;
 	struct squashfs_lreg_inode		lreg;
 	struct squashfs_dir_inode		dir;
 	struct squashfs_ldir_inode		ldir;
 	struct squashfs_ipc_inode		ipc;
+	struct squashfs_lipc_inode		lipc;
 };
 
 struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
 	unsigned int		unused;
 };
 
+struct squashfs_xattr_entry {
+	__le16			type;	/* prefix id | VALUE_OOL flag */
+	__le16			size;	/* name length in bytes */
+	char			data[0];	/* name; prefix is implied */
+};
+
+struct squashfs_xattr_val {
+	__le32			vsize;	/* length of value[] in bytes */
+	char			value[0];	/* data, or __le64 ref if OOL */
+};
+
+struct squashfs_xattr_id {
+	__le64			xattr;	/* packed block/offset reference */
+	__le32			count;	/* number of xattrs for this id */
+	__le32			size;	/* NOTE(review): total size? confirm */
+};
+
+struct squashfs_xattr_id_table {
+	__le64			xattr_table_start;	/* base of xattr data */
+	__le32			xattr_ids;	/* number of xattr ids */
+	__le32			unused;
+};
+
 #endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c6..d3e3a37f28a 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
 struct squashfs_inode_info {
 	u64		start;
 	int		offset;
+	u64		xattr;
+	unsigned int	xattr_size;
+	int		xattr_count;
 	union {
 		struct {
 			u64		fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e2..d9037a5215f 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
 	int					next_meta_index;
 	__le64					*id_table;
 	__le64					*fragment_index;
+	__le64					*xattr_id_table;
 	struct mutex				read_data_mutex;
 	struct mutex				meta_index_mutex;
 	struct meta_index			*meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
 	__le64					*inode_lookup_table;
 	u64					inode_table;
 	u64					directory_table;
+	u64					xattr_table;
 	unsigned int				block_size;
 	unsigned short				block_log;
 	long long				bytes_used;
 	unsigned int				inodes;
+	int					xattr_ids;
 };
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a..88b4f860665 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/magic.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "xattr.h"
 
 static struct file_system_type squashfs_fs_type;
 static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	long long root_inode;
 	unsigned short flags;
 	unsigned int fragments;
-	u64 lookup_table_start;
+	u64 lookup_table_start, xattr_id_table_start;
 	int err;
 
 	TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (msblk->decompressor == NULL)
 		goto failed_mount;
 
-	/*
-	 * Check if there's xattrs in the filesystem.  These are not
-	 * supported in this version, so warn that they will be ignored.
-	 */
-	if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
-		ERROR("Xattrs in filesystem, these will be ignored\n");
-
 	/* Check the filesystem does not extend beyond the end of the
 	   block device */
 	msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 allocate_lookup_table:
 	lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
 	if (lookup_table_start == SQUASHFS_INVALID_BLK)
-		goto allocate_root;
+		goto allocate_xattr_table;
 
 	/* Allocate and read inode lookup table */
 	msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
 
 	sb->s_export_op = &squashfs_export_ops;
 
+allocate_xattr_table:
+	sb->s_xattr = squashfs_xattr_handlers;
+	xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+	if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
+		goto allocate_root;
+
+	/* Allocate and read xattr id lookup table */
+	msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+		xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+	if (IS_ERR(msblk->xattr_id_table)) {
+		err = PTR_ERR(msblk->xattr_id_table);
+		msblk->xattr_id_table = NULL;
+		if (err != -ENOTSUPP)
+			goto failed_mount;
+	}
 allocate_root:
 	root = new_inode(sb);
 	if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
 	kfree(msblk->inode_lookup_table);
 	kfree(msblk->fragment_index);
 	kfree(msblk->id_table);
+	kfree(msblk->xattr_id_table);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
 		kfree(sbi->inode_lookup_table);
+		kfree(sbi->xattr_id_table);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee3..ec86434921e 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 static int squashfs_symlink_readpage(struct file *file, struct page *page)
 {
@@ -114,3 +116,12 @@ error_out:
 const struct address_space_operations squashfs_symlink_aops = {
 	.readpage = squashfs_symlink_readpage
 };
+
+const struct inode_operations squashfs_symlink_inode_ops = {
+	.readlink = generic_readlink,
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+	.getxattr = generic_getxattr,	/* NULL if !CONFIG_SQUASHFS_XATTRS */
+	.listxattr = squashfs_listxattr	/* NULL if !CONFIG_SQUASHFS_XATTRS */
+};
+
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 00000000000..c7655e8b31c
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.c
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static const struct xattr_handler *squashfs_xattr_handler(int);
+
+ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
+	size_t buffer_size)
+{
+	struct inode *inode = d->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+						 + msblk->xattr_table;
+	int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+	int count = squashfs_i(inode)->xattr_count;
+	size_t rest = buffer_size;	/* buffer space left */
+	int err;
+
+	/* no xattr id table was loaded at mount, so nothing can be listed */
+	if (msblk->xattr_id_table == NULL)
+		return -EOPNOTSUPP;
+
+	/* emit each visible name as "<prefix><name>\0", or just count it */
+	while (count--) {
+		struct squashfs_xattr_entry entry;
+		struct squashfs_xattr_val val;
+		const struct xattr_handler *handler;
+		int name_size, prefix_size = 0;
+
+		err = squashfs_read_metadata(sb, &entry, &start, &offset,
+							sizeof(entry));
+		if (err < 0)
+			goto failed;
+
+		name_size = le16_to_cpu(entry.size);
+		handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
+		if (handler)
+			prefix_size = handler->list(d, buffer, rest, NULL,
+				name_size, handler->flags);
+		if (prefix_size) {	/* 0: no handler, or it declined */
+			if (buffer) {
+				if (prefix_size + name_size + 1 > rest) {
+					err = -ERANGE;
+					goto failed;
+				}
+				buffer += prefix_size;
+			}
+			err = squashfs_read_metadata(sb, buffer, &start,
+				&offset, name_size);
+			if (err < 0)
+				goto failed;
+			if (buffer) {
+				buffer[name_size] = '\0';
+				buffer += name_size + 1;
+			}
+			rest -= prefix_size + name_size + 1;
+		} else  {
+			/* no handler or insufficient privileges, so skip */
+			err = squashfs_read_metadata(sb, NULL, &start,
+				&offset, name_size);
+			if (err < 0)
+				goto failed;
+		}
+
+
+		/* skip the rest of the entry: value header, then the value */
+		err = squashfs_read_metadata(sb, &val, &start, &offset,
+						sizeof(val));
+		if (err < 0)
+			goto failed;
+
+		err = squashfs_read_metadata(sb, NULL, &start, &offset,
+						le32_to_cpu(val.vsize));
+		if (err < 0)
+			goto failed;
+	}
+	err = buffer_size - rest;	/* total bytes of names listed */
+
+failed:
+	return err;
+}
+
+/* Look up name in namespace name_index: returns value size or -errno */
+static int squashfs_xattr_get(struct inode *inode, int name_index,
+	const char *name, void *buffer, size_t buffer_size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+						 + msblk->xattr_table;
+	int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+	int count = squashfs_i(inode)->xattr_count;
+	int name_len = strlen(name);
+	int err, vsize;
+	char *target = kmalloc(name_len, GFP_KERNEL);	/* scratch for names */
+
+	if (target == NULL)
+		return -ENOMEM;
+
+	/* loop reading each xattr name */
+	for (; count; count--) {
+		struct squashfs_xattr_entry entry;
+		struct squashfs_xattr_val val;
+		int type, prefix, name_size;
+
+		err = squashfs_read_metadata(sb, &entry, &start, &offset,
+							sizeof(entry));
+		if (err < 0)
+			goto failed;
+
+		name_size = le16_to_cpu(entry.size);
+		type = le16_to_cpu(entry.type);
+		prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
+
+		/* only copy names that can match; others are stepped over */
+		if (prefix == name_index && name_size == name_len)
+			err = squashfs_read_metadata(sb, target, &start,
+						&offset, name_size);
+		else
+			err = squashfs_read_metadata(sb, NULL, &start,
+						&offset, name_size);
+		if (err < 0)
+			goto failed;
+		if (prefix == name_index && name_size == name_len &&
+					strncmp(target, name, name_size) == 0) {
+			/* found xattr */
+			if (type & SQUASHFS_XATTR_VALUE_OOL) {
+				__le64 xattr_val;	/* as stored on disk */
+				u64 xattr;		/* in CPU byte order */
+				/* val is a reference to the real location */
+				err = squashfs_read_metadata(sb, &val, &start,
+						&offset, sizeof(val));
+				if (err < 0)
+					goto failed;
+				err = squashfs_read_metadata(sb, &xattr_val,
+					&start, &offset, sizeof(xattr_val));
+				if (err < 0)
+					goto failed;
+				xattr = le64_to_cpu(xattr_val);
+				start = SQUASHFS_XATTR_BLK(xattr) +
+							msblk->xattr_table;
+				offset = SQUASHFS_XATTR_OFFSET(xattr);
+			}
+			/* read value size; data too if a buffer is given */
+			err = squashfs_read_metadata(sb, &val, &start, &offset,
+							sizeof(val));
+			if (err < 0)
+				goto failed;
+
+			vsize = le32_to_cpu(val.vsize);
+			if (buffer) {
+				if (vsize > buffer_size) {
+					err = -ERANGE;
+					goto failed;
+				}
+				err = squashfs_read_metadata(sb, buffer, &start,
+					 &offset, vsize);
+				if (err < 0)
+					goto failed;
+			}
+			break;
+		}
+
+		/* no match, skip remaining xattr entry */
+		err = squashfs_read_metadata(sb, &val, &start, &offset,
+							sizeof(val));
+		if (err < 0)
+			goto failed;
+		err = squashfs_read_metadata(sb, NULL, &start, &offset,
+						le32_to_cpu(val.vsize));
+		if (err < 0)
+			goto failed;
+	}
+	err = count ? vsize : -ENODATA;	/* count != 0 only after break */
+
+failed:
+	kfree(target);
+	return err;
+}
+
+/*
+ * User namespace support: list writes only the prefix, get reads the value
+ */
+static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
+	const char *name, size_t name_len, int type)
+{
+	if (list && XATTR_USER_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+	return XATTR_USER_PREFIX_LEN;	/* even if not copied */
+}
+
+static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
+	size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;	/* empty name */
+
+	return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= squashfs_user_list,
+	.get	= squashfs_user_get
+};
+
+/*
+ * Trusted namespace support: listing is limited to CAP_SYS_ADMIN
+ */
+static size_t squashfs_trusted_list(struct dentry *d, char *list,
+	size_t list_size, const char *name, size_t name_len, int type)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;	/* 0 makes squashfs_listxattr skip the entry */
+
+	if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+	return XATTR_TRUSTED_PREFIX_LEN;	/* even if not copied */
+}
+
+static int squashfs_trusted_get(struct dentry *d, const char *name,
+	void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;	/* empty name */
+
+	return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= squashfs_trusted_list,
+	.get	= squashfs_trusted_get
+};
+
+/*
+ * Security namespace support: list writes the prefix, get reads the value
+ */
+static size_t squashfs_security_list(struct dentry *d, char *list,
+	size_t list_size, const char *name, size_t name_len, int type)
+{
+	if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+	return XATTR_SECURITY_PREFIX_LEN;	/* even if not copied */
+}
+
+static int squashfs_security_get(struct dentry *d, const char *name,
+	void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;	/* empty name */
+
+	return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= squashfs_security_list,
+	.get	= squashfs_security_get
+};
+
+static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+{
+	if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
+		/* bits beyond the prefix byte and OOL flag: unknown type */
+		return NULL;
+
+	switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
+	case SQUASHFS_XATTR_USER:
+		return &squashfs_xattr_user_handler;
+	case SQUASHFS_XATTR_TRUSTED:
+		return &squashfs_xattr_trusted_handler;
+	case SQUASHFS_XATTR_SECURITY:
+		return &squashfs_xattr_security_handler;
+	default:
+		/* unknown namespace id: squashfs_listxattr skips the entry */
+		return NULL;
+	}
+}
+
+const struct xattr_handler *squashfs_xattr_handlers[] = {
+	&squashfs_xattr_user_handler,
+	&squashfs_xattr_trusted_handler,
+	&squashfs_xattr_security_handler,
+	NULL	/* end marker; table is installed as sb->s_xattr */
+};
+
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 00000000000..9da071ae181
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.h
+ */
+
+#ifdef CONFIG_SQUASHFS_XATTRS
+extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
+		u64 *, int *);
+extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
+		unsigned int *, unsigned long long *);
+#else /* !CONFIG_SQUASHFS_XATTRS: stubs so callers need no #ifdefs */
+static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
+		u64 start, u64 *xattr_table_start, int *xattr_ids)
+{
+	ERROR("Xattrs in filesystem, these will be ignored\n");
+	return ERR_PTR(-ENOTSUPP);	/* fill_super tolerates this */
+}
+
+static inline int squashfs_xattr_lookup(struct super_block *sb,
+		unsigned int index, int *count, unsigned int *size,
+		unsigned long long *xattr)
+{
+	return 0;	/* outputs are left untouched */
+}
+#define squashfs_listxattr NULL	/* ops tables get NULL hooks */
+#define generic_getxattr NULL
+#define squashfs_xattr_handlers NULL
+#endif /* CONFIG_SQUASHFS_XATTRS */
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 00000000000..cfb41106098
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+
+/*
+ * This file implements code to map the 32-bit xattr id stored in the inode
+ * into the on disk location of the xattr data.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/* Map an xattr id to its on-disk xattr location via the id lookup table */
+int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
+		int *count, unsigned int *size, unsigned long long *xattr)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int block = SQUASHFS_XATTR_BLOCK(index);
+	int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
+	struct squashfs_xattr_id id;
+	u64 start_block;
+	int err;
+
+	if (msblk->xattr_ids <= 0 || index >= msblk->xattr_ids)
+		return -EINVAL;	/* id from the inode is outside the table */
+	start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+	err = squashfs_read_metadata(sb, &id, &start_block, &offset,
+							sizeof(id));
+	if (err < 0)
+		return err;
+
+	*xattr = le64_to_cpu(id.xattr);	/* packed block/offset reference */
+	*size = le32_to_cpu(id.size);
+	*count = le32_to_cpu(id.count);
+	return 0;
+}
+
+/* Read the uncompressed xattr id lookup table index from disk into memory */
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+		u64 *xattr_table_start, int *xattr_ids)
+{
+	unsigned int len;
+	__le64 *xid_table;
+	struct squashfs_xattr_id_table id_table;
+	int err;
+
+	err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
+	if (err < 0) {
+		ERROR("unable to read xattr id table\n");
+		return ERR_PTR(err);
+	}
+	*xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
+	*xattr_ids = le32_to_cpu(id_table.xattr_ids);
+	if (*xattr_ids <= 0)	/* corrupt: index would be empty or wrap */
+		return ERR_PTR(-EINVAL);
+	len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+
+	TRACE("In read_xattr_index_table, length %u\n", len);
+
+	/* Allocate xattr id lookup table indexes */
+	xid_table = kmalloc(len, GFP_KERNEL);
+	if (xid_table == NULL) {
+		ERROR("Failed to allocate xattr id index table\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* the index follows the table header on disk */
+	err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
+	if (err < 0) {
+		ERROR("unable to read xattr id index table\n");
+		kfree(xid_table);
+		return ERR_PTR(err);
+	}
+	return xid_table;	/* caller owns and kfree()s it */
+}
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a7..1660c81ffa3 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,6 +207,7 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 /* readdir and lookup functions */
 const struct file_operations udf_dir_operations = {
+	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
 	.readdir		= udf_readdir,
 	.unlocked_ioctl		= udf_ioctl,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a9..ad9bc1ebd3a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -918,6 +918,7 @@ again:
 	sbi->s_bytesex = BYTESEX_LE;
 	switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
 		case UFS_MAGIC:
+		case UFS_MAGIC_BW:
 		case UFS2_MAGIC:
 		case UFS_MAGIC_LFN:
 	        case UFS_MAGIC_FEA:
@@ -927,6 +928,7 @@ again:
 	sbi->s_bytesex = BYTESEX_BE;
 	switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
 		case UFS_MAGIC:
+		case UFS_MAGIC_BW:
 		case UFS2_MAGIC:
 		case UFS_MAGIC_LFN:
 	        case UFS_MAGIC_FEA:
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0..8aba544f9fa 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
 #define UFS_SECTOR_SIZE 512
 #define UFS_SECTOR_BITS 9
 #define UFS_MAGIC  0x00011954
+#define UFS_MAGIC_BW 0x0f242697
 #define UFS2_MAGIC 0x19540119
 #define UFS_CIGAM  0x54190100 /* byteswapped MAGIC */
 
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8b..c8fb13f83b3 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_itable.o \
 				   xfs_dfrag.o \
 				   xfs_log.o \
+				   xfs_log_cil.o \
 				   xfs_log_recover.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c4..649ade8ef59 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
  *	Note that this in no way locks the underlying pages, so it is only
  *	useful for synchronizing concurrent use of buffer objects, not for
  *	synchronizing independent access to the underlying pages.
+ *
+ *	If we come across a stale, pinned, locked buffer, we know that we
+ *	are being asked to lock a buffer that has been reallocated. Because
+ *	it is pinned, we know that the log has not been pushed to disk and
+ *	hence it will still be locked. Rather than sleeping until someone
+ *	else pushes the log, push it ourselves before trying to get the lock.
  */
 void
 xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
 {
 	trace_xfs_buf_lock(bp, _RET_IP_);
 
+	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+		xfs_log_force(bp->b_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d..9ac8aea9152 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
 #include "xfs_dmapi.h"
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f24dbe5efde..f2d1718c916 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_DMAPI	"dmapi"		/* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_DMI	"dmi"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_DELAYLOG   "delaylog"	/* Delayed logging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog"	/* Delayed logging disabled */
 
 /*
  * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DMAPI;
 		} else if (!strcmp(this_char, MNTOPT_DMI)) {
 			mp->m_flags |= XFS_MOUNT_DMAPI;
+		} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
+			mp->m_flags |= XFS_MOUNT_DELAYLOG;
+			cmn_err(CE_WARN,
+				"Enabling EXPERIMENTAL delayed logging feature "
+				"- use at your own risk.\n");
+		} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
+			mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
 		} else if (!strcmp(this_char, "ihashsize")) {
 			cmn_err(CE_WARN,
 	"XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
 		{ XFS_MOUNT_FILESTREAMS,	"," MNTOPT_FILESTREAM },
 		{ XFS_MOUNT_DMAPI,		"," MNTOPT_DMAPI },
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
+		{ XFS_MOUNT_DELAYLOG,		"," MNTOPT_DELAYLOG },
 		{ 0, NULL }
 	};
 	static struct proc_xfs_info xfs_info_unset[] = {
@@ -1755,7 +1765,7 @@ xfs_init_zones(void)
 	 * but it is much faster.
 	 */
 	xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
-				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+				(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
 				  NBWORD) * sizeof(int))), "xfs_buf_item");
 	if (!xfs_buf_item_zone)
 		goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd290..ff6bc797baf 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
 
 );
 
+#define XFS_BUSY_SYNC \
+	{ 0,	"async" }, \
+	{ 1,	"sync" }	/* labels for xfs_alloc_busy 'sync' */
+
 TRACE_EVENT(xfs_alloc_busy,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, int slot),
-	TP_ARGS(mp, agno, agbno, len, slot),
+	TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
+	TP_ARGS(trans, agno, agbno, len, sync),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(int, tid)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, slot)
+		__field(int, sync)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->tid = trans->t_ticket->t_tid;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->slot = slot;
+		__entry->sync = sync;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+	TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
+		  __entry->tid,
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __entry->slot)
+		  __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
 
 );
 
-#define XFS_BUSY_STATES \
-	{ 0,	"found" }, \
-	{ 1,	"missing" }
-
 TRACE_EVENT(xfs_alloc_unbusy,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int slot, int found),
-	TP_ARGS(mp, agno, slot, found),
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
-		__field(int, slot)
-		__field(int, found)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
-		__entry->slot = slot;
-		__entry->found = found;
+		__entry->agbno = agbno;
+		__entry->len = len;
 	),
-	TP_printk("dev %d:%d agno %u slot %d %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
-		  __entry->slot,
-		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+		  __entry->agbno,
+		  __entry->len)
 );
 
+#define XFS_BUSY_STATES \
+	{ 0,	"missing" }, \
+	{ 1,	"found" }	/* for xfs_alloc_busysearch 'found' */
+
 TRACE_EVENT(xfs_alloc_busysearch,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, xfs_lsn_t lsn),
-	TP_ARGS(mp, agno, agbno, len, lsn),
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int found),
+	TP_ARGS(mp, agno, agbno, len, found),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(xfs_lsn_t, lsn)
+		__field(int, found)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->lsn = lsn;
+		__entry->found = found;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
+	TP_printk("dev %d:%d agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
+		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+	TP_PROTO(struct xfs_trans *trans),
+	TP_ARGS(trans),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->lsn = trans->t_commit_lsn;
+	),
+	TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
 		  __entry->lsn)
 );
 
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df012..585e7633dfc 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
 	for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
 		xfs_qm_dqinit_core(curid, type, d);
 	xfs_trans_dquot_buf(tp, bp,
-			    (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
-			    ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
-			     XFS_BLI_GDQUOT_BUF)));
+			    (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+			    ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+			     XFS_BLF_GDQUOT_BUF)));
 	xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c..401f364ad36 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
 } xfs_agfl_t;
 
 /*
- * Busy block/extent entry.  Used in perag to mark blocks that have been freed
- * but whose transactions aren't committed to disk yet.
+ * Busy block/extent entry.  Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_alloc_busy_insert() for details.
  */
-typedef struct xfs_perag_busy {
-	xfs_agblock_t	busy_start;
-	xfs_extlen_t	busy_length;
-	struct xfs_trans *busy_tp;	/* transaction that did the free */
-} xfs_perag_busy_t;
+struct xfs_busy_extent {
+	struct rb_node	rb_node;	/* ag by-bno indexed search tree */
+	struct list_head list;		/* transaction busy extent list */
+	xfs_agnumber_t	agno;		/* AG number */
+	xfs_agblock_t	bno;		/* start block; the rbtree key */
+	xfs_extlen_t	length;		/* extent length */
+	xlog_tid_t	tid;		/* ID of the creating transaction */
+};
 
 /*
  * Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
 	xfs_agino_t	pagl_leftrec;
 	xfs_agino_t	pagl_rightrec;
 #ifdef __KERNEL__
-	spinlock_t	pagb_lock;	/* lock for pagb_list */
+	spinlock_t	pagb_lock;	/* lock for pagb_tree */
+	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
 
 	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 #endif
 	int		pagb_count;	/* pagb slots in use */
-	xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS];	/* unstable blocks */
 } xfs_perag_t;
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb256..a7fbe8a99b1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len);
+static int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+		    xfs_agblock_t bno, xfs_extlen_t len);
 
 /*
  * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
 				be32_to_cpu(agf->agf_length));
 			xfs_alloc_log_agf(args->tp, args->agbp,
 						XFS_AGF_FREEBLKS);
-			/* search the busylist for these blocks */
-			xfs_alloc_search_busy(args->tp, args->agno,
-					args->agbno, args->len);
+			/*
+			 * Search the busylist for these blocks and mark the
+			 * transaction as synchronous if blocks are found. This
+			 * avoids the need to block due to a synchronous log
+			 * force to ensure correct ordering as the synchronous
+			 * transaction will guarantee that for us.
+			 */
+			if (xfs_alloc_busy_search(args->mp, args->agno,
+						args->agbno, args->len))
+				xfs_trans_set_sync(args->tp);
 		}
 		if (!args->isfl)
 			xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
 	 * when the iclog commits to disk.  If a busy block is allocated,
 	 * the iclog is pushed up to the LSN that freed the block.
 	 */
-	xfs_alloc_mark_busy(tp, agno, bno, len);
+	xfs_alloc_busy_insert(tp, agno, bno, len);
 	return 0;
 
  error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
 	*bnop = bno;
 
 	/*
-	 * As blocks are freed, they are added to the per-ag busy list
-	 * and remain there until the freeing transaction is committed to
-	 * disk.  Now that we have allocated blocks, this list must be
-	 * searched to see if a block is being reused.  If one is, then
-	 * the freeing transaction must be pushed to disk NOW by forcing
-	 * to disk all iclogs up that transaction's LSN.
+	 * As blocks are freed, they are added to the per-ag busy list and
+	 * remain there until the freeing transaction is committed to disk.
+	 * Now that we have allocated blocks, this list must be searched to see
+	 * if a block is being reused.  If one is, then the freeing transaction
+	 * must be pushed to disk before this transaction.
+	 *
+	 * We do this by setting the current transaction to a sync transaction
+	 * which guarantees that the freeing transaction is on disk before this
+	 * transaction. This is done instead of a synchronous log force here so
+	 * that we don't sit and wait with the AGF locked in the transaction
+	 * during the log force.
 	 */
-	xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
+		xfs_trans_set_sync(tp);
 	return 0;
 }
 
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		spin_lock_init(&pag->pagb_lock);
 		pag->pagb_count = 0;
-		memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
+		pag->pagb_tree = RB_ROOT;
 		pag->pagf_init = 1;
 	}
 #ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
  * list is reused, the transaction that freed it must be forced to disk
  * before continuing to use the block.
  *
- * xfs_alloc_mark_busy - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ * xfs_alloc_busy_insert - add to the per-ag busy list
+ * xfs_alloc_busy_clear - remove an item from the per-ag busy list
+ * xfs_alloc_busy_search - search for a busy extent
+ */
+
+/*
+ * Insert a new extent into the busy tree.
+ *
+ * The busy extent tree is indexed by the start block of the busy extent.
+ * There can be multiple overlapping ranges in the busy extent tree but only
+ * ever one entry at a given start block. The reason for this is that
+ * multi-block extents can be freed, then smaller chunks of that extent
+ * allocated and freed again before the first transaction commit is on disk.
+ * If the exact same start block is freed a second time, we have to wait for
+ * that busy extent to pass out of the tree before the new extent is inserted.
+ * There are two main cases we have to handle here.
+ *
+ * The first case is a transaction that triggers a "free - allocate - free"
+ * cycle. This can occur during btree manipulations as a btree block is freed
+ * to the freelist, then allocated from the free list, then freed again. In
+ * this case, the second extent free is what triggers the duplicate and as
+ * such the transaction IDs should match. Because the extent was allocated in
+ * this transaction, the transaction must be marked as synchronous. This is
+ * true for all cases where the free/alloc/free occurs in the one transaction,
+ * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
+ * This serves to catch violations of the second case quite effectively.
+ *
+ * The second case is where the free/alloc/free occur in different
+ * transactions. In this case, the thread freeing the extent the second time
+ * can't mark the extent busy immediately because it is already tracked in a
+ * transaction that may be committing.  When the log commit for the existing
+ * busy extent completes, the busy extent will be removed from the tree. If we
+ * allow the second busy insert to continue using that busy extent structure,
+ * it can be freed before this transaction is safely in the log.  Hence our
+ * only option in this case is to force the log to remove the existing busy
+ * extent from the list before we insert the new one with the current
+ * transaction ID.
+ *
+ * The problem we are trying to avoid in the free-alloc-free in separate
+ * transactions is most easily described with a timeline:
+ *
+ *      Thread 1	Thread 2	Thread 3	xfslogd
+ *	xact alloc
+ *	free X
+ *	mark busy
+ *	commit xact
+ *	free xact
+ *			xact alloc
+ *			alloc X
+ *			busy search
+ *			mark xact sync
+ *			commit xact
+ *			free xact
+ *			force log
+ *			checkpoint starts
+ *			....
+ *					xact alloc
+ *					free X
+ *					mark busy
+ *					finds match
+ *					*** KABOOM! ***
+ *					....
+ *							log IO completes
+ *							unbusy X
+ *			checkpoint completes
+ *
+ * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
+ * the checkpoint completes, and the busy extent it matched will have been
+ * removed from the tree when it is woken. Hence it can then continue safely.
+ *
+ * However, to ensure this matching process is robust, we need to use the
+ * transaction ID to identify transactions, as delayed logging results in
+ * the busy extent and transaction lifecycles being different. i.e. the busy
+ * extent is active for a lot longer than the transaction.  Hence the
+ * transaction structure can be freed and reallocated, then mark the same
+ * extent busy again in the new transaction. In this case the new transaction
+ * will have a different tid but can have the same address, and hence we need
+ * to check against the tid.
+ *
+ * Future: for delayed logging, we could avoid the log force if the extent was
+ * first freed in the current checkpoint sequence. This, however, requires the
+ * ability to pin the current checkpoint in memory until this transaction
+ * commits to ensure that both the original free and the current one combine
+ * logically into the one checkpoint. If the checkpoint sequences are
+ * different, however, we still need to wait on a log force.
  */
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len)
+xfs_alloc_busy_insert(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len)
 {
-	xfs_perag_busy_t	*bsy;
+	struct xfs_busy_extent	*new;
+	struct xfs_busy_extent	*busyp;
 	struct xfs_perag	*pag;
-	int			n;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	int			match;
 
-	pag = xfs_perag_get(tp->t_mountp, agno);
-	spin_lock(&pag->pagb_lock);
 
-	/* search pagb_list for an open slot */
-	for (bsy = pag->pagb_list, n = 0;
-	     n < XFS_PAGB_NUM_SLOTS;
-	     bsy++, n++) {
-		if (bsy->busy_tp == NULL) {
-			break;
-		}
+	new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
+	if (!new) {
+		/*
+		 * No Memory!  Since it is now not possible to track the free
+		 * block, make this a synchronous transaction to ensure that
+		 * the block is not reused before this transaction commits.
+		 */
+		trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+		xfs_trans_set_sync(tp);
+		return;
 	}
 
-	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
+	new->agno = agno;
+	new->bno = bno;
+	new->length = len;
+	new->tid = xfs_log_get_trans_ident(tp);
 
-	if (n < XFS_PAGB_NUM_SLOTS) {
-		bsy = &pag->pagb_list[n];
-		pag->pagb_count++;
-		bsy->busy_start = bno;
-		bsy->busy_length = len;
-		bsy->busy_tp = tp;
-		xfs_trans_add_busy(tp, agno, n);
-	} else {
+	INIT_LIST_HEAD(&new->list);
+
+	/* trace before insert to be able to see failed inserts */
+	trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+
+	pag = xfs_perag_get(tp->t_mountp, new->agno);
+restart:
+	spin_lock(&pag->pagb_lock);
+	rbp = &pag->pagb_tree.rb_node;
+	parent = NULL;
+	busyp = NULL;
+	match = 0;
+	while (*rbp && match >= 0) {
+		parent = *rbp;
+		busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
+
+		if (new->bno < busyp->bno) {
+			/* may overlap, but exact start block is lower */
+			rbp = &(*rbp)->rb_left;
+			if (new->bno + new->length > busyp->bno)
+				match = busyp->tid == new->tid ? 1 : -1;
+		} else if (new->bno > busyp->bno) {
+			/* may overlap, but exact start block is higher */
+			rbp = &(*rbp)->rb_right;
+			if (bno < busyp->bno + busyp->length)
+				match = busyp->tid == new->tid ? 1 : -1;
+		} else {
+			match = busyp->tid == new->tid ? 1 : -1;
+			break;
+		}
+	}
+	if (match < 0) {
+		/* overlap marked busy in different transaction */
+		spin_unlock(&pag->pagb_lock);
+		xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
+		goto restart;
+	}
+	if (match > 0) {
 		/*
-		 * The busy list is full!  Since it is now not possible to
-		 * track the free block, make this a synchronous transaction
-		 * to insure that the block is not reused before this
-		 * transaction commits.
+		 * overlap marked busy in same transaction. Update if exact
+		 * start block match, otherwise combine the busy extents into
+		 * a single range.
 		 */
-		xfs_trans_set_sync(tp);
-	}
+		if (busyp->bno == new->bno) {
+			busyp->length = max(busyp->length, new->length);
+			spin_unlock(&pag->pagb_lock);
+			ASSERT(tp->t_flags & XFS_TRANS_SYNC);
+			xfs_perag_put(pag);
+			kmem_free(new);
+			return;
+		}
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		new->length = max(busyp->bno + busyp->length,
+					new->bno + new->length) -
+				min(busyp->bno, new->bno);
+		new->bno = min(busyp->bno, new->bno);
+	} else
+		busyp = NULL;
 
+	rb_link_node(&new->rb_node, parent, rbp);
+	rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+	list_add(&new->list, &tp->t_busy);
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
+	kmem_free(busyp);
 }
 
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-		     xfs_agnumber_t agno,
-		     int idx)
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate.  The busy extent tree lock is taken internally, so the caller
+ * must not hold it.  This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+static int
+xfs_alloc_busy_search(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len)
 {
 	struct xfs_perag	*pag;
-	xfs_perag_busy_t	*list;
+	struct rb_node		*rbp;
+	struct xfs_busy_extent	*busyp;
+	int			match = 0;
 
-	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
-	pag = xfs_perag_get(tp->t_mountp, agno);
+	pag = xfs_perag_get(mp, agno);
 	spin_lock(&pag->pagb_lock);
-	list = pag->pagb_list;
 
-	trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
-
-	if (list[idx].busy_tp == tp) {
-		list[idx].busy_tp = NULL;
-		pag->pagb_count--;
+	rbp = pag->pagb_tree.rb_node;
+
+	/* find closest start bno overlap */
+	while (rbp) {
+		busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+		if (bno < busyp->bno) {
+			/* may overlap, but exact start block is lower */
+			if (bno + len > busyp->bno)
+				match = -1;
+			rbp = rbp->rb_left;
+		} else if (bno > busyp->bno) {
+			/* may overlap, but exact start block is higher */
+			if (bno < busyp->bno + busyp->length)
+				match = -1;
+			rbp = rbp->rb_right;
+		} else {
+			/* bno matches busyp, length determines exact match */
+			match = (busyp->length == len) ? 1 : -1;
+			break;
+		}
 	}
-
 	spin_unlock(&pag->pagb_lock);
+	trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
 	xfs_perag_put(pag);
+	return match;
 }
 
-
-/*
- * If we find the extent in the busy list, force the log out to get the
- * extent out of the busy list so the caller can use it straight away.
- */
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len)
+void
+xfs_alloc_busy_clear(
+	struct xfs_mount	*mp,
+	struct xfs_busy_extent	*busyp)
 {
 	struct xfs_perag	*pag;
-	xfs_perag_busy_t	*bsy;
-	xfs_agblock_t		uend, bend;
-	xfs_lsn_t		lsn = 0;
-	int			cnt;
 
-	pag = xfs_perag_get(tp->t_mountp, agno);
-	spin_lock(&pag->pagb_lock);
-	cnt = pag->pagb_count;
+	trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
+						busyp->length);
 
-	/*
-	 * search pagb_list for this slot, skipping open slots. We have to
-	 * search the entire array as there may be multiple overlaps and
-	 * we have to get the most recent LSN for the log force to push out
-	 * all the transactions that span the range.
-	 */
-	uend = bno + len - 1;
-	for (cnt = 0; cnt < pag->pagb_count; cnt++) {
-		bsy = &pag->pagb_list[cnt];
-		if (!bsy->busy_tp)
-			continue;
+	ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
+						busyp->length) == 1);
 
-		bend = bsy->busy_start + bsy->busy_length - 1;
-		if (bno > bend || uend < bsy->busy_start)
-			continue;
+	list_del_init(&busyp->list);
 
-		/* (start1,length1) within (start2, length2) */
-		if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
-			lsn = bsy->busy_tp->t_commit_lsn;
-	}
+	pag = xfs_perag_get(mp, busyp->agno);
+	spin_lock(&pag->pagb_lock);
+	rb_erase(&busyp->rb_node, &pag->pagb_tree);
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
-	trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
 
-	/*
-	 * If a block was found, force the log through the LSN of the
-	 * transaction that freed the block
-	 */
-	if (lsn)
-		xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
+	kmem_free(busyp);
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa3978..6d05199b667 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
+struct xfs_busy_extent;
 
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 #ifdef __KERNEL__
 
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(xfs_trans_t *tp,
 		xfs_agnumber_t agno,
 		xfs_agblock_t bno,
 		xfs_extlen_t len);
 
 void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-		xfs_agnumber_t ag,
-		int idx);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1..83f49421875 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
 	 * disk. If a busy block is allocated, the iclog is pushed up to the
 	 * LSN that freed the block.
 	 */
-	xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
 	return 0;
 }
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727..02a80984aa0 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
 	nbytes = last - first + 1;
 	bfset(bip->bli_logged, first, nbytes);
 	for (x = 0; x < nbytes; x++) {
-		chunk_num = byte >> XFS_BLI_SHIFT;
+		chunk_num = byte >> XFS_BLF_SHIFT;
 		word_num = chunk_num >> BIT_TO_WORD_SHIFT;
 		bit_num = chunk_num & (NBWORD - 1);
 		wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
 		 * cancel flag in it.
 		 */
 		trace_xfs_buf_item_size_stale(bip);
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		return 1;
 	}
 
@@ -197,9 +197,9 @@ xfs_buf_item_size(
 		} else if (next_bit != last_bit + 1) {
 			last_bit = next_bit;
 			nvecs++;
-		} else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
-			   (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
-			    XFS_BLI_CHUNK)) {
+		} else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
+			   (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
+			    XFS_BLF_CHUNK)) {
 			last_bit = next_bit;
 			nvecs++;
 		} else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
 	vecp++;
 	nvecs = 1;
 
+	/*
+	 * If it is an inode buffer, transfer the in-memory state to the
+	 * format flags and clear the in-memory state. We do not transfer
+	 * this state if the inode buffer allocation has not yet been committed
+	 * to the log as setting the XFS_BLF_INODE_BUF flag will prevent
+	 * correct replay of the inode allocation.
+	 */
+	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+		if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+		      xfs_log_item_in_current_chkpt(&bip->bli_item)))
+			bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+	}
+
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		/*
 		 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
 		 * cancel flag in it.
 		 */
 		trace_xfs_buf_item_format_stale(bip);
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		bip->bli_format.blf_size = nvecs;
 		return;
 	}
@@ -294,28 +308,28 @@ xfs_buf_item_format(
 		 * keep counting and scanning.
 		 */
 		if (next_bit == -1) {
-			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			buffer_offset = first_bit * XFS_BLF_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			vecp->i_len = nbits * XFS_BLF_CHUNK;
 			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			break;
 		} else if (next_bit != last_bit + 1) {
-			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			buffer_offset = first_bit * XFS_BLF_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			vecp->i_len = nbits * XFS_BLF_CHUNK;
 			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			vecp++;
 			first_bit = next_bit;
 			last_bit = next_bit;
 			nbits = 1;
-		} else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
-			   (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
-			    XFS_BLI_CHUNK)) {
-			buffer_offset = first_bit * XFS_BLI_CHUNK;
+		} else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
+			   (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
+			    XFS_BLF_CHUNK)) {
+			buffer_offset = first_bit * XFS_BLF_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			vecp->i_len = nbits * XFS_BLF_CHUNK;
 			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 /* You would think we need to bump the nvecs here too, but we do not
  * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
 }
 
 /*
- * This is called to pin the buffer associated with the buf log
- * item in memory so it cannot be written out.  Simply call bpin()
- * on the buffer to do this.
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out.  Simply call bpin() on the buffer to do this.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
  */
+
 STATIC void
 xfs_buf_item_pin(
 	xfs_buf_log_item_t	*bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
+	atomic_inc(&bip->bli_refcount);
 	trace_xfs_buf_item_pin(bip);
 	xfs_bpin(bp);
 }
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		trace_xfs_buf_item_unpin_stale(bip);
 
 		/*
@@ -489,20 +509,23 @@ xfs_buf_item_trylock(
 }
 
 /*
- * Release the buffer associated with the buf log item.
- * If there is no dirty logged data associated with the
- * buffer recorded in the buf log item, then free the
- * buf log item and remove the reference to it in the
- * buffer.
+ * Release the buffer associated with the buf log item.  If there is no dirty
+ * logged data associated with the buffer recorded in the buf log item, then
+ * free the buf log item and remove the reference to it in the buffer.
+ *
+ * This call ignores the recursion count.  It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
  *
- * This call ignores the recursion count.  It is only called
- * when the buffer should REALLY be unlocked, regardless
- * of the recursion count.
+ * We unconditionally drop the transaction's reference to the log item. If the
+ * item was logged, then another reference was taken when it was pinned, so we
+ * can safely drop the transaction reference now.  This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
  *
- * If the XFS_BLI_HOLD flag is set in the buf log item, then
- * free the log item if necessary but do not unlock the buffer.
- * This is for support of xfs_trans_bhold(). Make sure the
- * XFS_BLI_HOLD field is cleared if we don't free the item.
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
+ * if necessary but do not unlock the buffer.  This is for support of
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
+ * free the item.
  */
 STATIC void
 xfs_buf_item_unlock(
@@ -514,73 +537,54 @@ xfs_buf_item_unlock(
 
 	bp = bip->bli_buf;
 
-	/*
-	 * Clear the buffer's association with this transaction.
-	 */
+	/* Clear the buffer's association with this transaction. */
 	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
 
 	/*
-	 * If this is a transaction abort, don't return early.
-	 * Instead, allow the brelse to happen.
-	 * Normally it would be done for stale (cancelled) buffers
-	 * at unpin time, but we'll never go through the pin/unpin
-	 * cycle if we abort inside commit.
+	 * If this is a transaction abort, don't return early.  Instead, allow
+	 * the brelse to happen.  Normally it would be done for stale
+	 * (cancelled) buffers at unpin time, but we'll never go through the
+	 * pin/unpin cycle if we abort inside commit.
 	 */
 	aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
 
 	/*
-	 * If the buf item is marked stale, then don't do anything.
-	 * We'll unlock the buffer and free the buf item when the
-	 * buffer is unpinned for the last time.
+	 * Before possibly freeing the buf item, determine if we should
+	 * release the buffer at the end of this routine.
 	 */
-	if (bip->bli_flags & XFS_BLI_STALE) {
-		bip->bli_flags &= ~XFS_BLI_LOGGED;
-		trace_xfs_buf_item_unlock_stale(bip);
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
-		if (!aborted)
-			return;
-	}
+	hold = bip->bli_flags & XFS_BLI_HOLD;
+
+	/* Clear the per transaction state. */
+	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
 
 	/*
-	 * Drop the transaction's reference to the log item if
-	 * it was not logged as part of the transaction.  Otherwise
-	 * we'll drop the reference in xfs_buf_item_unpin() when
-	 * the transaction is really through with the buffer.
+	 * If the buf item is marked stale, then don't do anything.  We'll
+	 * unlock the buffer and free the buf item when the buffer is unpinned
+	 * for the last time.
 	 */
-	if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
-		atomic_dec(&bip->bli_refcount);
-	} else {
-		/*
-		 * Clear the logged flag since this is per
-		 * transaction state.
-		 */
-		bip->bli_flags &= ~XFS_BLI_LOGGED;
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		trace_xfs_buf_item_unlock_stale(bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+		if (!aborted) {
+			atomic_dec(&bip->bli_refcount);
+			return;
+		}
 	}
 
-	/*
-	 * Before possibly freeing the buf item, determine if we should
-	 * release the buffer at the end of this routine.
-	 */
-	hold = bip->bli_flags & XFS_BLI_HOLD;
 	trace_xfs_buf_item_unlock(bip);
 
 	/*
-	 * If the buf item isn't tracking any data, free it.
-	 * Otherwise, if XFS_BLI_HOLD is set clear it.
+	 * If the buf item isn't tracking any data, free it, otherwise drop the
+	 * reference we hold to it.
 	 */
 	if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
-			     bip->bli_format.blf_map_size)) {
+			     bip->bli_format.blf_map_size))
 		xfs_buf_item_relse(bp);
-	} else if (hold) {
-		bip->bli_flags &= ~XFS_BLI_HOLD;
-	}
+	else
+		atomic_dec(&bip->bli_refcount);
 
-	/*
-	 * Release the buffer if XFS_BLI_HOLD was not set.
-	 */
-	if (!hold) {
+	if (!hold)
 		xfs_buf_relse(bp);
-	}
 }
 
 /*
@@ -717,12 +721,12 @@ xfs_buf_item_init(
 	}
 
 	/*
-	 * chunks is the number of XFS_BLI_CHUNK size pieces
+	 * chunks is the number of XFS_BLF_CHUNK size pieces
 	 * the buffer can be divided into. Make sure not to
 	 * truncate any pieces.  map_size is the size of the
 	 * bitmap needed to describe the chunks of the buffer.
 	 */
-	chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
+	chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
 	map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
 
 	bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -790,8 +794,8 @@ xfs_buf_item_log(
 	/*
 	 * Convert byte offsets to bit numbers.
 	 */
-	first_bit = first >> XFS_BLI_SHIFT;
-	last_bit = last >> XFS_BLI_SHIFT;
+	first_bit = first >> XFS_BLF_SHIFT;
+	last_bit = last >> XFS_BLF_SHIFT;
 
 	/*
 	 * Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index df4454511f7..f20bb472d58 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format {
  * This flag indicates that the buffer contains on disk inodes
  * and requires special recovery handling.
  */
-#define	XFS_BLI_INODE_BUF	0x1
+#define	XFS_BLF_INODE_BUF	0x1
 /*
  * This flag indicates that the buffer should not be replayed
  * during recovery because its blocks are being freed.
  */
-#define	XFS_BLI_CANCEL		0x2
+#define	XFS_BLF_CANCEL		0x2
 /*
  * This flag indicates that the buffer contains on disk
  * user or group dquots and may require special recovery handling.
  */
-#define	XFS_BLI_UDQUOT_BUF	0x4
-#define XFS_BLI_PDQUOT_BUF	0x8
-#define	XFS_BLI_GDQUOT_BUF	0x10
+#define	XFS_BLF_UDQUOT_BUF	0x4
+#define XFS_BLF_PDQUOT_BUF	0x8
+#define	XFS_BLF_GDQUOT_BUF	0x10
 
-#define	XFS_BLI_CHUNK		128
-#define	XFS_BLI_SHIFT		7
+#define	XFS_BLF_CHUNK		128
+#define	XFS_BLF_SHIFT		7
 #define	BIT_TO_WORD_SHIFT	5
 #define	NBWORD			(NBBY * sizeof(unsigned int))
 
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
 #define	XFS_BLI_LOGGED		0x08
 #define	XFS_BLI_INODE_ALLOC_BUF	0x10
 #define XFS_BLI_STALE_INODE	0x20
+#define	XFS_BLI_INODE_BUF	0x40
 
 #define XFS_BLI_FLAGS \
 	{ XFS_BLI_HOLD,		"HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
 	{ XFS_BLI_STALE,	"STALE" }, \
 	{ XFS_BLI_LOGGED,	"LOGGED" }, \
 	{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
-	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }
+	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }, \
+	{ XFS_BLI_INODE_BUF,	"INODE_BUF" }
 
 
 #ifdef __KERNEL__
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ef96175c074..047b8a8e5c2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
 	va_list ap;
 
 #ifdef DEBUG
-	xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
+	xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
 #endif
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3038dd52c72..5215abc8023 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
 STATIC int	 xlog_space_left(xlog_t *log, int cycle, int bytes);
 STATIC int	 xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void	 xlog_dealloc_log(xlog_t *log);
-STATIC int	 xlog_write(struct log *log, struct xfs_log_vec *log_vector,
-			    struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
-			    xlog_in_core_t **commit_iclog, uint flags);
 
 /* local state machine functions */
 STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t		*log,
 STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 				   xlog_ticket_t *ticket);
 
-
-/* local ticket functions */
-STATIC xlog_ticket_t	*xlog_ticket_alloc(xlog_t *log,
-					 int	unit_bytes,
-					 int	count,
-					 char	clientid,
-					 uint	flags);
-
 #if defined(DEBUG)
 STATIC void	xlog_verify_dest_ptr(xlog_t *log, char *ptr);
 STATIC void	xlog_verify_grant_head(xlog_t *log, int equals);
@@ -360,6 +349,15 @@ xfs_log_reserve(
 		ASSERT(flags & XFS_LOG_PERM_RESERV);
 		internal_ticket = *ticket;
 
+		/*
+		 * this is a new transaction on the ticket, so we need to
+		 * change the transaction ID so that the next transaction has a
+		 * different TID in the log. Just add one to the existing tid
+		 * so that we can see chains of rolling transactions in the log
+		 * easily.
+		 */
+		internal_ticket->t_tid++;
+
 		trace_xfs_log_reserve(log, internal_ticket);
 
 		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -367,7 +365,8 @@ xfs_log_reserve(
 	} else {
 		/* may sleep if need to allocate more tickets */
 		internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
-						  client, flags);
+						  client, flags,
+						  KM_SLEEP|KM_MAYFAIL);
 		if (!internal_ticket)
 			return XFS_ERROR(ENOMEM);
 		internal_ticket->t_trans_type = t_type;
@@ -452,6 +451,13 @@ xfs_log_mount(
 	/* Normal transactions can now occur */
 	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
+	/*
+	 * Now the log has been fully initialised and we know where our
+	 * space grant counters are, we can initialise the permanent ticket
+	 * needed for delayed logging to work.
+	 */
+	xlog_cil_init_post_recovery(mp->m_log);
+
 	return 0;
 
 out_destroy_ail:
@@ -658,6 +664,10 @@ xfs_log_item_init(
 	item->li_ailp = mp->m_ail;
 	item->li_type = type;
 	item->li_ops = ops;
+	item->li_lv = NULL;
+
+	INIT_LIST_HEAD(&item->li_ail);
+	INIT_LIST_HEAD(&item->li_cil);
 }
 
 /*
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	*iclogp = log->l_iclog;			/* complete ring */
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
+	error = xlog_cil_init(log);
+	if (error)
+		goto out_free_iclog;
 	return log;
 
 out_free_iclog:
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
 	xlog_in_core_t	*iclog, *next_iclog;
 	int		i;
 
+	xlog_cil_destroy(log);
+
 	iclog = log->l_iclog;
 	for (i=0; i<log->l_iclog_bufs; i++) {
 		sv_destroy(&iclog->ic_force_wait);
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t		*log,
  * print out info relating to regions written which consume
  * the reservation
  */
-STATIC void
-xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
+void
+xlog_print_tic_res(
+	struct xfs_mount	*mp,
+	struct xlog_ticket	*ticket)
 {
 	uint i;
 	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
 			    "bad-rtype" : res_type_str[r_type-1]),
 			    ticket->t_res_arr[i].r_len);
 	}
+
+	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+		"xfs_log_write: reservation ran out. Need to up reservation");
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 }
 
 /*
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish(
  *	we don't update ic_offset until the end when we know exactly how many
  *	bytes have been written out.
  */
-STATIC int
+int
 xlog_write(
 	struct log		*log,
 	struct xfs_log_vec	*log_vector,
@@ -1889,22 +1910,26 @@ xlog_write(
 	*start_lsn = 0;
 
 	len = xlog_write_calc_vec_length(ticket, log_vector);
-	if (ticket->t_curr_res < len) {
-		xlog_print_tic_res(log->l_mp, ticket);
-#ifdef DEBUG
-		xlog_panic(
-	"xfs_log_write: reservation ran out. Need to up reservation");
-#else
-		/* Customer configurable panic */
-		xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp,
-	"xfs_log_write: reservation ran out. Need to up reservation");
+	if (log->l_cilp) {
+		/*
+		 * Region headers and bytes are already accounted for.
+		 * We only need to take into account start records and
+		 * split regions in this function.
+		 */
+		if (ticket->t_flags & XLOG_TIC_INITED)
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
 
-		/* If we did not panic, shutdown the filesystem */
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE);
-#endif
-	}
+		/*
+		 * Commit record headers need to be accounted for. These
+		 * come in as separate writes so are easy to detect.
+		 */
+		if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
+	} else
+		ticket->t_curr_res -= len;
 
-	ticket->t_curr_res -= len;
+	if (ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, ticket);
 
 	index = 0;
 	lv = log_vector;
@@ -3000,6 +3025,8 @@ _xfs_log_force(
 
 	XFS_STATS_INC(xs_log_force);
 
+	xlog_cil_push(log, 1);
+
 	spin_lock(&log->l_icloglock);
 
 	iclog = log->l_iclog;
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn(
 
 	XFS_STATS_INC(xs_log_force);
 
+	if (log->l_cilp) {
+		lsn = xlog_cil_push_lsn(log, lsn);
+		if (lsn == NULLCOMMITLSN)
+			return 0;
+	}
+
 try_again:
 	spin_lock(&log->l_icloglock);
 	iclog = log->l_iclog;
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get(
 	return ticket;
 }
 
+xlog_tid_t
+xfs_log_get_trans_ident(
+	struct xfs_trans	*tp)
+{
+	return tp->t_ticket->t_tid;
+}
+
 /*
  * Allocate and initialise a new log ticket.
  */
-STATIC xlog_ticket_t *
+xlog_ticket_t *
 xlog_ticket_alloc(
 	struct log	*log,
 	int		unit_bytes,
 	int		cnt,
 	char		client,
-	uint		xflags)
+	uint		xflags,
+	int		alloc_flags)
 {
 	struct xlog_ticket *tic;
 	uint		num_headers;
 	int		iclog_space;
 
-	tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
 	if (!tic)
 		return NULL;
 
@@ -3647,6 +3688,11 @@ xlog_state_ioerror(
  *	c. nothing new gets queued up after (a) and (b) are done.
  *	d. if !logerror, flush the iclogs to disk, then seal them off
  *	   for business.
+ *
+ * Note: for delayed logging the !logerror case needs to flush the regions
+ * held in memory out to the iclogs before flushing them to disk. This needs
+ * to be done before the log is marked as shutdown, otherwise the flush to the
+ * iclogs will fail.
  */
 int
 xfs_log_force_umount(
@@ -3680,6 +3726,16 @@ xfs_log_force_umount(
 		return 1;
 	}
 	retval = 0;
+
+	/*
+	 * Flush the in memory commit item list before marking the log as
+	 * being shut down. We need to do it in this order to ensure all the
+	 * completed transactions are flushed to disk with the xfs_log_force()
+	 * call below.
+	 */
+	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+		xlog_cil_push(log, 1);
+
 	/*
 	 * We must hold both the GRANT lock and the LOG lock,
 	 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 229d1f36ba9..04c78e642cc 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
 #define __XFS_LOG_H__
 
 /* get lsn fields */
-
 #define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
 #define BLOCK_LSN(lsn) ((uint)(lsn))
 
@@ -114,6 +113,9 @@ struct xfs_log_vec {
 	struct xfs_log_vec	*lv_next;	/* next lv in build list */
 	int			lv_niovecs;	/* number of iovecs in lv */
 	struct xfs_log_iovec	*lv_iovecp;	/* iovec array */
+	struct xfs_log_item	*lv_item;	/* owner */
+	char			*lv_buf;	/* formatted buffer */
+	int			lv_buf_len;	/* size of formatted buffer */
 };
 
 /*
@@ -134,6 +136,7 @@ struct xlog_in_core;
 struct xlog_ticket;
 struct xfs_log_item;
 struct xfs_item_ops;
+struct xfs_trans;
 
 void	xfs_log_item_init(struct xfs_mount	*mp,
 			struct xfs_log_item	*item,
@@ -187,9 +190,16 @@ int	  xfs_log_need_covered(struct xfs_mount *mp);
 
 void	  xlog_iodone(struct xfs_buf *);
 
-struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
+xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
+
+int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+				struct xfs_log_vec *log_vector,
+				xfs_lsn_t *commit_lsn, int flags);
+bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
 #endif
 
 
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 00000000000..bb17cc044bf
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+
+/*
+ * Perform initial CIL structure initialisation. If the CIL is not
+ * enabled in this filesystem, ensure the log->l_cilp is null so
+ * we can check this conditional to determine if we are doing delayed
+ * logging or not.
+ */
+int
+xlog_cil_init(
+	struct log	*log)
+{
+	struct xfs_cil	*cil;
+	struct xfs_cil_ctx *ctx;
+
+	log->l_cilp = NULL;
+	if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
+		return 0;
+
+	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	if (!cil)
+		return ENOMEM;
+
+	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	if (!ctx) {
+		kmem_free(cil);
+		return ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&cil->xc_cil);
+	INIT_LIST_HEAD(&cil->xc_committing);
+	spin_lock_init(&cil->xc_cil_lock);
+	init_rwsem(&cil->xc_ctx_lock);
+	sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+
+	INIT_LIST_HEAD(&ctx->committing);
+	INIT_LIST_HEAD(&ctx->busy_extents);
+	ctx->sequence = 1;
+	ctx->cil = cil;
+	cil->xc_ctx = ctx;
+
+	cil->xc_log = log;
+	log->l_cilp = cil;
+	return 0;
+}
+
+void
+xlog_cil_destroy(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	if (log->l_cilp->xc_ctx) {
+		if (log->l_cilp->xc_ctx->ticket)
+			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+		kmem_free(log->l_cilp->xc_ctx);
+	}
+
+	ASSERT(list_empty(&log->l_cilp->xc_cil));
+	kmem_free(log->l_cilp);
+}
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+	struct log	*log)
+{
+	struct xlog_ticket *tic;
+
+	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+				KM_SLEEP|KM_NOFS);
+	tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+
+	/*
+	 * set the current reservation to zero so we know to steal the basic
+	 * transaction overhead reservation from the first transaction commit.
+	 */
+	tic->t_curr_res = 0;
+	return tic;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push.  This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+	log->l_cilp->xc_ctx->sequence = 1;
+	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
+								log->l_curr_block);
+}
+
+/*
+ * Insert the log item into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ *
+ * If this is the first time the item is being placed into the CIL in this
+ * context, pin it so it can't be written to disk until the CIL is flushed to
+ * the iclog and the iclog written to disk.
+ */
+static void
+xlog_cil_insert(
+	struct log		*log,
+	struct xlog_ticket	*ticket,
+	struct xfs_log_item	*item,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	int			len;
+	int			diff_iovecs;
+	int			iclog_space;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		len = lv->lv_buf_len - old->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&item->li_cil));
+
+		len = lv->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
+
+	}
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	spin_lock(&cil->xc_cil_lock);
+	list_move_tail(&item->li_cil, &cil->xc_cil);
+	ctx->nvecs += diff_iovecs;
+
+	/*
+	 * If this is the first time the item is being committed to the CIL,
+	 * store the sequence number on the log item so we can tell
+	 * in future commits whether this is the first checkpoint the item is
+	 * being committed into.
+	 */
+	if (!item->li_seq)
+		item->li_seq = ctx->sequence;
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+/*
+ * Format log items into flat buffers
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item, which is then inserted into the
+ * Committed Item List for tracking until the next checkpoint is written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer.  Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundaries without needing a change in the format of the
+ * item/region encapsulation.
+ *
+ * Hence what we need to do now is rewrite the vector array to point
+ * to the copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ */
+static void
+xlog_cil_format_items(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn)
+{
+	struct xfs_log_vec *lv;
+
+	if (start_lsn)
+		*start_lsn = log->l_cilp->xc_ctx->sequence;
+
+	ASSERT(log_vector);
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		void	*ptr;
+		int	index;
+		int	len = 0;
+
+		/* build the vector array and calculate its length */
+		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+		for (index = 0; index < lv->lv_niovecs; index++)
+			len += lv->lv_iovecp[index].i_len;
+
+		lv->lv_buf_len = len;
+		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		ptr = lv->lv_buf;
+
+		for (index = 0; index < lv->lv_niovecs; index++) {
+			struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+			memcpy(ptr, vec->i_addr, vec->i_len);
+			vec->i_addr = ptr;
+			ptr += vec->i_len;
+		}
+		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+
+		xlog_cil_insert(log, ticket, lv->lv_item, lv);
+	}
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv->lv_buf);
+		kmem_free(lv);
+		lv = next;
+	}
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Takes the context lock in read mode to lock out background commit while
+ * the items are formatted and accounted, and drops it again before any
+ * background push is triggered.
+ */
+int
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct log		*log = mp->m_log;
+	int			log_flags = 0;
+	int			push = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		xlog_cil_free_logvec(log_vector);
+		return XFS_ERROR(EIO);
+	}
+
+	/* lock out background commit */
+	down_read(&log->l_cilp->xc_ctx_lock);
+	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy)) {
+		spin_lock(&log->l_cilp->xc_cil_lock);
+		list_splice_init(&tp->t_busy,
+					&log->l_cilp->xc_ctx->busy_extents);
+		spin_unlock(&log->l_cilp->xc_cil_lock);
+	}
+
+	tp->t_commit_lsn = *commit_lsn;
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/* check for background commit before unlock */
+	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+		push = 1;
+	up_read(&log->l_cilp->xc_ctx_lock);
+
+	/*
+	 * We need to push CIL every so often so we don't cache more than we
+	 * can fit in the log. The limit really is that a checkpoint can't be
+	 * more than half the log (the current checkpoint is not allowed to
+	 * overwrite the previous checkpoint), but commit latency and memory
+	 * usage limit this to a smaller size in most cases.
+	 */
+	if (push)
+		xlog_cil_push(log, 0);
+	return 0;
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+	void	*args,
+	int	abort)
+{
+	struct xfs_cil_ctx	*ctx = args;
+	struct xfs_log_vec	*lv;
+	int			abortflag = abort ? XFS_LI_ABORTED : 0;
+	struct xfs_busy_extent	*busyp, *n;
+
+	/* unpin all the log items */
+	for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
+		xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
+							abortflag);
+	}
+
+	list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
+		xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+
+	spin_lock(&ctx->cil->xc_cil_lock);
+	list_del(&ctx->committing);
+	spin_unlock(&ctx->cil->xc_cil_lock);
+
+	xlog_cil_free_logvec(ctx->lv_chain);
+	kmem_free(ctx);
+}
+
+/*
+ * Push the Committed Item List to the log. If the push_now flag is not set,
+ * then it is a background flush and so we can choose to ignore it.
+ */
+int
+xlog_cil_push(
+	struct log		*log,
+	int			push_now)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*lv;
+	struct xfs_cil_ctx	*ctx;
+	struct xfs_cil_ctx	*new_ctx;
+	struct xlog_in_core	*commit_iclog;
+	struct xlog_ticket	*tic;
+	int			num_lv;
+	int			num_iovecs;
+	int			len;
+	int			error = 0;
+	struct xfs_trans_header thdr;
+	struct xfs_log_iovec	lhdr;
+	struct xfs_log_vec	lvhdr = { NULL };
+	xfs_lsn_t		commit_lsn;
+
+	if (!cil)
+		return 0;
+
+	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+	new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+	/* lock out transaction commit, but don't block on background push */
+	if (!down_write_trylock(&cil->xc_ctx_lock)) {
+		if (!push_now)
+			goto out_free_ticket;
+		down_write(&cil->xc_ctx_lock);
+	}
+	ctx = cil->xc_ctx;
+
+	/* check if we've anything to push */
+	if (list_empty(&cil->xc_cil))
+		goto out_skip;
+
+	/* check for spurious background flush */
+	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		goto out_skip;
+
+	/*
+	 * pull all the log vectors off the items in the CIL, and
+	 * remove the items from the CIL. We don't need the CIL lock
+	 * here because it's only needed on the transaction commit
+	 * side which is currently locked out by the flush lock.
+	 */
+	lv = NULL;
+	num_lv = 0;
+	num_iovecs = 0;
+	len = 0;
+	while (!list_empty(&cil->xc_cil)) {
+		struct xfs_log_item	*item;
+		int			i;
+
+		item = list_first_entry(&cil->xc_cil,
+					struct xfs_log_item, li_cil);
+		list_del_init(&item->li_cil);
+		if (!ctx->lv_chain)
+			ctx->lv_chain = item->li_lv;
+		else
+			lv->lv_next = item->li_lv;
+		lv = item->li_lv;
+		item->li_lv = NULL;
+
+		num_lv++;
+		num_iovecs += lv->lv_niovecs;
+		for (i = 0; i < lv->lv_niovecs; i++)
+			len += lv->lv_iovecp[i].i_len;
+	}
+
+	/*
+	 * initialise the new context and attach it to the CIL. Then attach
+	 * the current context to the CIL committing list so it can be found
+	 * during log forces to extract the commit lsn of the sequence that
+	 * needs to be forced.
+	 */
+	INIT_LIST_HEAD(&new_ctx->committing);
+	INIT_LIST_HEAD(&new_ctx->busy_extents);
+	new_ctx->sequence = ctx->sequence + 1;
+	new_ctx->cil = cil;
+	cil->xc_ctx = new_ctx;
+
+	/*
+	 * The switch is now done, so we can drop the context lock and move out
+	 * of a shared context. We can't just go straight to the commit record,
+	 * though - we need to synchronise with previous and future commits so
+	 * that the commit records are correctly ordered in the log to ensure
+	 * that we process items during log IO completion in the correct order.
+	 *
+	 * For example, if we get an EFI in one checkpoint and the EFD in the
+	 * next (e.g. due to log forces), we do not want the checkpoint with
+	 * the EFD to be committed before the checkpoint with the EFI.  Hence
+	 * we must strictly order the commit records of the checkpoints so
+	 * that: a) the checkpoint callbacks are attached to the iclogs in the
+	 * correct order; and b) the checkpoints are replayed in correct order
+	 * in log recovery.
+	 *
+	 * Hence we need to add this context to the committing context list so
+	 * that higher sequences will wait for us to write out a commit record
+	 * before they do.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	list_add(&ctx->committing, &cil->xc_committing);
+	spin_unlock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+
+	/*
+	 * Build a checkpoint transaction header and write it to the log to
+	 * begin the transaction. We need to account for the space used by the
+	 * transaction header here as it is not accounted for in xlog_write().
+	 *
+	 * The LSN we need to pass to the log items on transaction commit is
+	 * the LSN reported by the first log vector write. If we use the commit
+	 * record lsn then we can move the tail beyond the grant write head.
+	 */
+	tic = ctx->ticket;
+	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+	thdr.th_type = XFS_TRANS_CHECKPOINT;
+	thdr.th_tid = tic->t_tid;
+	thdr.th_num_items = num_iovecs;
+	lhdr.i_addr = (xfs_caddr_t)&thdr;
+	lhdr.i_len = sizeof(xfs_trans_header_t);
+	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+
+	lvhdr.lv_niovecs = 1;
+	lvhdr.lv_iovecp = &lhdr;
+	lvhdr.lv_next = ctx->lv_chain;
+
+	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now that we've written the checkpoint into the log, strictly
+	 * order the commit records so replay will get them in the right order.
+	 */
+restart:
+	spin_lock(&cil->xc_cil_lock);
+	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+		/*
+		 * Higher sequences will wait for this one so skip them.
+		 * Don't wait for our own sequence, either.
+		 */
+		if (new_ctx->sequence >= ctx->sequence)
+			continue;
+		if (!new_ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+	}
+	spin_unlock(&cil->xc_cil_lock);
+
+	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+	if (error || commit_lsn == -1)
+		goto out_abort;
+
+	/* attach all the transactions w/ busy extents to iclog */
+	ctx->log_cb.cb_func = xlog_cil_committed;
+	ctx->log_cb.cb_arg = ctx;
+	error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now the checkpoint commit is complete and we've attached the
+	 * callbacks to the iclog we can assign the commit LSN to the context
+	 * and wake up anyone who is waiting for the commit to complete.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	ctx->commit_lsn = commit_lsn;
+	sv_broadcast(&cil->xc_commit_wait);
+	spin_unlock(&cil->xc_cil_lock);
+
+	/* release the hounds! */
+	return xfs_log_release_iclog(log->l_mp, commit_iclog);
+
+out_skip:
+	up_write(&cil->xc_ctx_lock);
+out_free_ticket:
+	xfs_log_ticket_put(new_ctx->ticket);
+	kmem_free(new_ctx);
+	return 0;
+
+out_abort:
+	xlog_cil_committed(ctx, XFS_LI_ABORTED);
+	return XFS_ERROR(EIO);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if an
+ * iclog flush is necessary following this call.
+ *
+ * XXX: Initially, just push the CIL unconditionally and return whatever
+ * commit lsn is there. It'll be empty, so this is broken for now.
+ */
+xfs_lsn_t
+xlog_cil_push_lsn(
+	struct log	*log,
+	xfs_lsn_t	push_seq)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx;
+	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
+
+restart:
+	down_write(&cil->xc_ctx_lock);
+	ASSERT(push_seq <= cil->xc_ctx->sequence);
+
+	/* check to see if we need to force out the current context */
+	if (push_seq == cil->xc_ctx->sequence) {
+		up_write(&cil->xc_ctx_lock);
+		xlog_cil_push(log, 1);
+		goto restart;
+	}
+
+	/*
+	 * See if we can find a previous sequence still committing.
+	 * We can drop the flush lock as soon as we have the cil lock
+	 * because we are now only comparing contexts protected by
+	 * the cil lock.
+	 *
+	 * We need to wait for all previous sequence commits to complete
+	 * before allowing the force of push_seq to go ahead. Hence block
+	 * on commits for those as well.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+	list_for_each_entry(ctx, &cil->xc_committing, committing) {
+		if (ctx->sequence > push_seq)
+			continue;
+		if (!ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+		if (ctx->sequence != push_seq)
+			continue;
+		/* found it! */
+		commit_lsn = ctx->commit_lsn;
+	}
+	spin_unlock(&cil->xc_cil_lock);
+	return commit_lsn;
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+	struct xfs_log_item *lip)
+{
+	struct xfs_cil_ctx *ctx;
+
+	if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
+		return false;
+	if (list_empty(&lip->li_cil))
+		return false;
+
+	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+
+	/*
+	 * li_seq is written on the first commit of a log item to record the
+	 * first checkpoint it is written to. Hence if it is different to the
+	 * current sequence, we're in a new checkpoint.
+	 */
+	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+		return false;
+	return true;
+}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9cf69515445..8c072618965 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
 #define	XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
 #define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
 					   shutdown */
-typedef __uint32_t xlog_tid_t;
-
 
 #ifdef __KERNEL__
 /*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
 } xlog_in_core_t;
 
 /*
+ * The CIL context is used to aggregate per-transaction details as well as
+ * being passed to the iclog for checkpoint post-commit processing.  After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+	struct xfs_cil		*cil;
+	xfs_lsn_t		sequence;	/* chkpt sequence # */
+	xfs_lsn_t		start_lsn;	/* first LSN of chkpt commit */
+	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
+	struct xlog_ticket	*ticket;	/* chkpt ticket */
+	int			nvecs;		/* number of regions */
+	int			space_used;	/* aggregate size of regions */
+	struct list_head	busy_extents;	/* busy extents in chkpt */
+	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
+	xfs_log_callback_t	log_cb;		/* completion callback hook. */
+	struct list_head	committing;	/* ctx committing list */
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+	struct log		*xc_log;
+	struct list_head	xc_cil;
+	spinlock_t		xc_cil_lock;
+	struct xfs_cil_ctx	*xc_ctx;
+	struct rw_semaphore	xc_ctx_lock;
+	struct list_head	xc_committing;
+	sv_t			xc_commit_wait;
+};
+
+/*
+ * The amount of log space we should allow the CIL to aggregate is difficult to
+ * size.  Whatever we choose, we have to make sure we can get a reservation for
+ * the log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but that it is not too
+ * large for the log and does not induce too much latency when writing out
+ * through the iclogs. We track both space consumed and the number of vectors
+ * in the checkpoint context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers.  The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for the
+ * checkpoint size so long as they don't violate any other size rules.  Hence
+ * the initial maximum size for the checkpoint transaction will be set to a
+ * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
+ * right now based on the latency of writing out a large amount of data through
+ * the circular iclog buffers.
+ */
+/* min(l_logsize / 4, 8MB); accepts any struct log pointer expression. */
+#define XLOG_CIL_SPACE_LIMIT(log)	\
+	(min(((log)->l_logsize >> 2), (8 * 1024 * 1024)))
+
+/*
  * The reservation head lsn is not made up of a cycle number and block number.
  * Instead, it uses a cycle number and byte number.  Logs don't expect to
  * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
 	/* The following fields don't need locking */
 	struct xfs_mount	*l_mp;	        /* mount point */
 	struct xfs_ail		*l_ailp;	/* AIL log is working with */
+	struct xfs_cil		*l_cilp;	/* CIL log is working with */
 	struct xfs_buf		*l_xbuf;        /* extra buffer for log
 						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
@@ -438,14 +530,17 @@ typedef struct log {
 
 #define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
 
-
 /* common routines */
 extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
 extern int	 xlog_recover(xlog_t *log);
 extern int	 xlog_recover_finish(xlog_t *log);
 extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
 
-extern kmem_zone_t	*xfs_log_ticket_zone;
+extern kmem_zone_t *xfs_log_ticket_zone;
+struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
+				int count, char client, uint xflags,
+				int alloc_flags);
+
 
 static inline void
 xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
 	*off += bytes;
 }
 
+void	xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+int	xlog_write(struct log *log, struct xfs_log_vec *log_vector,
+				struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+				xlog_in_core_t **commit_iclog, uint flags);
+
+/*
+ * Committed Item List interfaces
+ */
+int	xlog_cil_init(struct log *log);
+void	xlog_cil_init_post_recovery(struct log *log);
+void	xlog_cil_destroy(struct log *log);
+
+int	xlog_cil_push(struct log *log, int push_now);
+xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+
 /*
  * Unmount record type is used as a pseudo transaction type for the ticket.
  * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0de08e36631..14a69aec2c0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans(
 
 		switch (ITEM_TYPE(item)) {
 		case XFS_LI_BUF:
-			if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
+			if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
 				trace_xfs_log_recover_item_reorder_head(log,
 							trans, item, pass);
 				list_move(&item->ri_list, &trans->r_itemq);
@@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1(
 	/*
 	 * If this isn't a cancel buffer item, then just return.
 	 */
-	if (!(flags & XFS_BLI_CANCEL)) {
+	if (!(flags & XFS_BLF_CANCEL)) {
 		trace_xfs_log_recover_buf_not_cancel(log, buf_f);
 		return;
 	}
@@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1(
  * Check to see whether the buffer being recovered has a corresponding
  * entry in the buffer cancel record table.  If it does then return 1
  * so that it will be cancelled, otherwise return 0.  If the buffer is
- * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
+ * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
  * the refcount on the entry in the table and remove it from the table
  * if this is the last reference.
  *
@@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled(
 		 * There is nothing in the table built in pass one,
 		 * so this buffer must not be cancelled.
 		 */
-		ASSERT(!(flags & XFS_BLI_CANCEL));
+		ASSERT(!(flags & XFS_BLF_CANCEL));
 		return 0;
 	}
 
@@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled(
 		 * There is no corresponding entry in the table built
 		 * in pass one, so this buffer has not been cancelled.
 		 */
-		ASSERT(!(flags & XFS_BLI_CANCEL));
+		ASSERT(!(flags & XFS_BLF_CANCEL));
 		return 0;
 	}
 
@@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled(
 			 * one in the table and remove it if this is the
 			 * last reference.
 			 */
-			if (flags & XFS_BLI_CANCEL) {
+			if (flags & XFS_BLF_CANCEL) {
 				bcp->bc_refcount--;
 				if (bcp->bc_refcount == 0) {
 					if (prevp == NULL) {
@@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled(
 	 * We didn't find a corresponding entry in the table, so
 	 * return 0 so that the buffer is NOT cancelled.
 	 */
-	ASSERT(!(flags & XFS_BLI_CANCEL));
+	ASSERT(!(flags & XFS_BLF_CANCEL));
 	return 0;
 }
 
@@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer(
 			nbits = xfs_contig_bits(data_map, map_size,
 							 bit);
 			ASSERT(nbits > 0);
-			reg_buf_offset = bit << XFS_BLI_SHIFT;
-			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
+			reg_buf_offset = bit << XFS_BLF_SHIFT;
+			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
 			item_index++;
 		}
 
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer(
 		}
 
 		ASSERT(item->ri_buf[item_index].i_addr != NULL);
-		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
+		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
 		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
 
 		/*
@@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer(
 		nbits = xfs_contig_bits(data_map, map_size, bit);
 		ASSERT(nbits > 0);
 		ASSERT(item->ri_buf[i].i_addr != NULL);
-		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
+		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
 		ASSERT(XFS_BUF_COUNT(bp) >=
-		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
+		       ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
 
 		/*
 		 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer(
 		 */
 		error = 0;
 		if (buf_f->blf_flags &
-		   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
 			if (item->ri_buf[i].i_addr == NULL) {
 				cmn_err(CE_ALERT,
 					"XFS: NULL dquot in %s.", __func__);
@@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer(
 		}
 
 		memcpy(xfs_buf_offset(bp,
-			(uint)bit << XFS_BLI_SHIFT),	/* dest */
+			(uint)bit << XFS_BLF_SHIFT),	/* dest */
 			item->ri_buf[i].i_addr,		/* source */
-			nbits<<XFS_BLI_SHIFT);		/* length */
+			nbits<<XFS_BLF_SHIFT);		/* length */
  next:
 		i++;
 		bit += nbits;
@@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
 	}
 
 	type = 0;
-	if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
+	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
 		type |= XFS_DQ_USER;
-	if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
+	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
 		type |= XFS_DQ_PROJ;
-	if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
+	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
 		type |= XFS_DQ_GROUP;
 	/*
 	 * This type of quotas was turned off, so ignore this buffer
@@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
  * here which overlaps that may be stale.
  *
  * When meta-data buffers are freed at run time we log a buffer item
- * with the XFS_BLI_CANCEL bit set to indicate that previous copies
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
  * of the buffer in the log should not be replayed at recovery time.
  * This is so that if the blocks covered by the buffer are reused for
  * file data before we crash we don't end up replaying old, freed
@@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans(
 	if (pass == XLOG_RECOVER_PASS1) {
 		/*
 		 * In this pass we're only looking for buf items
-		 * with the XFS_BLI_CANCEL bit set.
+		 * with the XFS_BLF_CANCEL bit set.
 		 */
 		xlog_recover_do_buffer_pass1(log, buf_f);
 		return 0;
@@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans(
 
 	mp = log->l_mp;
 	buf_flags = XBF_LOCK;
-	if (!(flags & XFS_BLI_INODE_BUF))
+	if (!(flags & XFS_BLF_INODE_BUF))
 		buf_flags |= XBF_MAPPED;
 
 	bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans(
 	}
 
 	error = 0;
-	if (flags & XFS_BLI_INODE_BUF) {
+	if (flags & XFS_BLF_INODE_BUF) {
 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
 	} else if (flags &
-		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
 	} else {
 		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d74920725..1c55ccbb379 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
 #define XLOG_RHASH(tid)	\
 	((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
 
-#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
 
 
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7e..1d2c7eed4ed 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
 						   must be synchronous except
 						   for space allocations */
+#define XFS_MOUNT_DELAYLOG	(1ULL << 1)	/* delayed logging is enabled */
 #define XFS_MOUNT_DMAPI		(1ULL << 2)	/* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be578ecb4af..ce558efa2ea 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,6 +44,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_trans_space.h"
 #include "xfs_inode_item.h"
+#include "xfs_trace.h"
 
 kmem_zone_t	*xfs_trans_zone;
 
@@ -243,9 +244,8 @@ _xfs_trans_alloc(
 	tp->t_type = type;
 	tp->t_mountp = mp;
 	tp->t_items_free = XFS_LIC_NUM_SLOTS;
-	tp->t_busy_free = XFS_LBC_NUM_SLOTS;
 	xfs_lic_init(&(tp->t_items));
-	XFS_LBC_INIT(&(tp->t_busy));
+	INIT_LIST_HEAD(&tp->t_busy);
 	return tp;
 }
 
@@ -255,8 +255,13 @@ _xfs_trans_alloc(
  */
 STATIC void
 xfs_trans_free(
-	xfs_trans_t	*tp)
+	struct xfs_trans	*tp)
 {
+	struct xfs_busy_extent	*busyp, *n;
+
+	list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
+		xfs_alloc_busy_clear(tp->t_mountp, busyp);
+
 	atomic_dec(&tp->t_mountp->m_active_trans);
 	xfs_trans_free_dqinfo(tp);
 	kmem_zone_free(xfs_trans_zone, tp);
@@ -285,9 +290,8 @@ xfs_trans_dup(
 	ntp->t_type = tp->t_type;
 	ntp->t_mountp = tp->t_mountp;
 	ntp->t_items_free = XFS_LIC_NUM_SLOTS;
-	ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
 	xfs_lic_init(&(ntp->t_items));
-	XFS_LBC_INIT(&(ntp->t_busy));
+	INIT_LIST_HEAD(&ntp->t_busy);
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(tp->t_ticket != NULL);
@@ -423,7 +427,6 @@ undo_blocks:
 	return error;
 }
 
-
 /*
  * Record the indicated change to the given field for application
  * to the file system's superblock when the transaction commits.
@@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas(
  * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
  * still need to update the incore superblock with the changes.
  */
-STATIC void
+void
 xfs_trans_unreserve_and_mod_sb(
 	xfs_trans_t	*tp)
 {
@@ -880,7 +883,7 @@ xfs_trans_fill_vecs(
  * they could be immediately flushed and we'd have to race with the flusher
  * trying to pull the item from the AIL as we add it.
  */
-static void
+void
 xfs_trans_item_committed(
 	struct xfs_log_item	*lip,
 	xfs_lsn_t		commit_lsn,
@@ -930,26 +933,6 @@ xfs_trans_item_committed(
 	IOP_UNPIN(lip);
 }
 
-/* Clear all the per-AG busy list items listed in this transaction */
-static void
-xfs_trans_clear_busy_extents(
-	struct xfs_trans	*tp)
-{
-	xfs_log_busy_chunk_t	*lbcp;
-	xfs_log_busy_slot_t	*lbsp;
-	int			i;
-
-	for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
-		i = 0;
-		for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
-			if (XFS_LBC_ISFREE(lbcp, i))
-				continue;
-			xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
-		}
-	}
-	xfs_trans_free_busy(tp);
-}
-
 /*
  * This is typically called by the LM when a transaction has been fully
  * committed to disk.  It needs to unpin the items which have
@@ -984,7 +967,6 @@ xfs_trans_committed(
 		kmem_free(licp);
 	}
 
-	xfs_trans_clear_busy_extents(tp);
 	xfs_trans_free(tp);
 }
 
@@ -1012,8 +994,7 @@ xfs_trans_uncommit(
 	xfs_trans_unreserve_and_mod_sb(tp);
 	xfs_trans_unreserve_and_mod_dquots(tp);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
 	xfs_trans_free(tp);
 }
 
@@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog(
 	*commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
 
 	tp->t_commit_lsn = *commit_lsn;
+	trace_xfs_trans_commit_lsn(tp);
+
 	if (nvec > XFS_TRANS_LOGVEC_COUNT)
 		kmem_free(log_vector);
 
@@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog(
 	return xfs_log_release_iclog(mp, commit_iclog);
 }
 
+/*
+ * Walk the log items and build a chain (via lv_next) of log vectors, one per
+ * dirty item with a non-zero IOP_SIZE(), each sized to hold that many iovecs.
+ * Unlike the old log vector format there is no transaction header in these
+ * vectors. Returns the chain head, or NULL if no item needs a vector.
+ */
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
+	xfs_trans_t	*tp)
+{
+	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*lv = NULL;
+	struct xfs_log_vec	*ret_lv = NULL;
+
+	lidp = xfs_trans_first_item(tp);
+
+	/* A transaction with no log items is unexpected: assert and bail out. */
+	if (!lidp) {
+		ASSERT(0);
+		return NULL;
+	}
+
+	while (lidp != NULL) {
+		struct xfs_log_vec *new_lv;
+
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+
+		/* Cache the iovec count; skip items with nothing to write. */
+		lidp->lid_size = IOP_SIZE(lidp->lid_item);
+		if (!lidp->lid_size) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+
+		new_lv = kmem_zalloc(sizeof(*new_lv) +
+				lidp->lid_size * sizeof(struct xfs_log_iovec),
+				KM_SLEEP);
+
+		/* The iovec array directly follows the log vector header. */
+		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+		new_lv->lv_niovecs = lidp->lid_size;
+		new_lv->lv_item = lidp->lid_item;
+		if (!ret_lv)
+			ret_lv = new_lv;
+		else
+			lv->lv_next = new_lv;
+		lv = new_lv;
+		lidp = xfs_trans_next_item(tp, lidp);
+	}
+
+	return ret_lv;
+}
+
+static int
+xfs_trans_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct xfs_log_vec	*log_vector;
+	int			error;
+
+	/*
+	 * Allocate a log vector structure for each dirty log item
+	 * to pass to the log write code. The CIL commit code will
+	 * format the vectors and save them away.
+	 */
+	log_vector = xfs_trans_alloc_log_vecs(tp);
+	if (!log_vector)
+		return ENOMEM;
+
+	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+	if (error)
+		return error;
+
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	/* xfs_trans_free_items() unlocks them first */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
+	xfs_trans_free(tp);
+	return 0;
+}
 
 /*
  * xfs_trans_commit
@@ -1221,7 +1291,11 @@ _xfs_trans_commit(
 		xfs_trans_apply_sb_deltas(tp);
 	xfs_trans_apply_dquot_deltas(tp);
 
-	error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+	if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+		error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+	else
+		error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
 	if (error == ENOMEM) {
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 		error = XFS_ERROR(EIO);
@@ -1259,8 +1333,7 @@ out_unreserve:
 			error = XFS_ERROR(EIO);
 	}
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-	xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
 	xfs_trans_free(tp);
 
 	XFS_STATS_INC(xs_trans_empty);
@@ -1338,8 +1411,7 @@ xfs_trans_cancel(
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
 	xfs_trans_free(tp);
 }
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c62beee0921..8c69e7824f6 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
 #define	XFS_TRANS_GROWFSRT_FREE		39
 #define	XFS_TRANS_SWAPEXT		40
 #define	XFS_TRANS_SB_COUNT		41
-#define	XFS_TRANS_TYPE_MAX		41
+#define	XFS_TRANS_CHECKPOINT		42
+#define	XFS_TRANS_TYPE_MAX		42
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
@@ -813,6 +815,7 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
+struct xfs_busy_extent;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -828,6 +831,11 @@ typedef struct xfs_log_item {
 							/* buffer item iodone */
 							/* callback func */
 	struct xfs_item_ops		*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;
 
 #define	XFS_LI_IN_AIL	0x1
@@ -872,34 +880,6 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_PUSHBUF	3
 
 /*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction.  The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
-	xfs_agnumber_t		lbc_ag;
-	ushort			lbc_idx;	/* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS	31
-typedef struct xfs_log_busy_chunk {
-	struct xfs_log_busy_chunk	*lbc_next;
-	uint				lbc_free;	/* free slots bitmask */
-	ushort				lbc_unused;	/* first unused */
-	xfs_log_busy_slot_t		lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define	XFS_LBC_MAX_SLOT	(XFS_LBC_NUM_SLOTS - 1)
-#define	XFS_LBC_FREEMASK	((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define	XFS_LBC_INIT(cp)	((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define	XFS_LBC_CLAIM(cp, slot)	((cp)->lbc_free &= ~(1 << (slot)))
-#define	XFS_LBC_SLOT(cp, slot)	(&((cp)->lbc_busy[(slot)]))
-#define	XFS_LBC_VACANCY(cp)	(((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define	XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-
-/*
  * This is the type of function which can be given to xfs_trans_callback()
  * to be called upon the transaction's commit to disk.
  */
@@ -950,8 +930,7 @@ typedef struct xfs_trans {
 	unsigned int		t_items_free;	/* log item descs free */
 	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
 	xfs_trans_header_t	t_header;	/* header for in-log trans */
-	unsigned int		t_busy_free;	/* busy descs free */
-	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
+	struct list_head	t_busy;		/* list of busy extents */
 	unsigned long		t_pflags;	/* saved process flags state */
 } xfs_trans_t;
 
@@ -1025,9 +1004,6 @@ int		_xfs_trans_commit(xfs_trans_t *,
 void		xfs_trans_cancel(xfs_trans_t *, int);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
-xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
-					xfs_agnumber_t ag,
-					xfs_extlen_t idx);
 
 extern kmem_zone_t	*xfs_trans_zone;
 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9cd809025f3..63d81a22f4f 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -114,7 +114,7 @@ _xfs_trans_bjoin(
 	xfs_buf_item_init(bp, tp->t_mountp);
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
 	if (reset_recur)
 		bip->bli_recur = 0;
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	/*
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t	*tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bip->bli_flags |= XFS_BLI_HOLD;
 	trace_xfs_trans_bhold(bip);
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t	*tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
 	bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 		bip->bli_flags &= ~XFS_BLI_STALE;
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		XFS_BUF_UNSTALE(bp);
-		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -762,8 +762,8 @@ xfs_trans_binval(
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
-		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
 		return;
@@ -774,7 +774,7 @@ xfs_trans_binval(
 	 * in the buf log item.  The STALE flag will be used in
 	 * xfs_buf_item_unpin() to determine if it should clean up
 	 * when the last reference to the buf item is given up.
-	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
+	 * We set the XFS_BLF_CANCEL flag in the buf log format structure
 	 * and log the buf item.  This will be used at recovery time
 	 * to determine that copies of the buffer in the log before
 	 * this should not be replayed.
@@ -792,9 +792,9 @@ xfs_trans_binval(
 	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_STALE(bp);
 	bip->bli_flags |= XFS_BLI_STALE;
-	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
-	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
-	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+	bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
 	memset((char *)(bip->bli_format.blf_data_map), 0,
 	      (bip->bli_format.blf_map_size * sizeof(uint)));
 	lidp->lid_flags |= XFS_LID_DIRTY;
@@ -802,16 +802,16 @@ xfs_trans_binval(
 }
 
 /*
- * This call is used to indicate that the buffer contains on-disk
- * inodes which must be handled specially during recovery.  They
- * require special handling because only the di_next_unlinked from
- * the inodes in the buffer should be recovered.  The rest of the
- * data in the buffer is logged via the inodes themselves.
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery.  They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered.  The rest of the data in the buffer is logged via the inodes
+ * themselves.
  *
- * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
- * format structure so that we'll know what to do at recovery time.
+ * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
  */
-/* ARGSUSED */
 void
 xfs_trans_inode_buf(
 	xfs_trans_t	*tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
-	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+	bip->bli_flags |= XFS_BLI_INODE_BUF;
 }
 
 /*
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf(
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
-	       type == XFS_BLI_PDQUOT_BUF ||
-	       type == XFS_BLI_GDQUOT_BUF);
+	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+	       type == XFS_BLF_PDQUOT_BUF ||
+	       type == XFS_BLF_GDQUOT_BUF);
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9ee..f11d37d06dc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
 void
 xfs_trans_free_items(
 	xfs_trans_t	*tp,
+	xfs_lsn_t	commit_lsn,
 	int		flags)
 {
 	xfs_log_item_chunk_t	*licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
 	 * Special case the embedded chunk so we don't free it below.
 	 */
 	if (!xfs_lic_are_all_free(licp)) {
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		xfs_lic_all_free(licp);
 		licp->lic_unused = 0;
 	}
@@ -322,7 +323,7 @@ xfs_trans_free_items(
 	 */
 	while (licp != NULL) {
 		ASSERT(!xfs_lic_are_all_free(licp));
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		next_licp = licp->lic_next;
 		kmem_free(licp);
 		licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
 
 	return freed;
 }
-
-
-/*
- * This is called to add the given busy item to the transaction's
- * list of busy items.  It must find a free busy item descriptor
- * or allocate a new one and add the item to that descriptor.
- * The function returns a pointer to busy descriptor used to point
- * to the new busy entry.  The log busy entry will now point to its new
- * descriptor with its ???? field.
- */
-xfs_log_busy_slot_t *
-xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
-{
-	xfs_log_busy_chunk_t	*lbcp;
-	xfs_log_busy_slot_t	*lbsp;
-	int			i=0;
-
-	/*
-	 * If there are no free descriptors, allocate a new chunk
-	 * of them and put it at the front of the chunk list.
-	 */
-	if (tp->t_busy_free == 0) {
-		lbcp = (xfs_log_busy_chunk_t*)
-		       kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
-		ASSERT(lbcp != NULL);
-		/*
-		 * Initialize the chunk, and then
-		 * claim the first slot in the newly allocated chunk.
-		 */
-		XFS_LBC_INIT(lbcp);
-		XFS_LBC_CLAIM(lbcp, 0);
-		lbcp->lbc_unused = 1;
-		lbsp = XFS_LBC_SLOT(lbcp, 0);
-
-		/*
-		 * Link in the new chunk and update the free count.
-		 */
-		lbcp->lbc_next = tp->t_busy.lbc_next;
-		tp->t_busy.lbc_next = lbcp;
-		tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
-
-		/*
-		 * Initialize the descriptor and the generic portion
-		 * of the log item.
-		 *
-		 * Point the new slot at this item and return it.
-		 * Also point the log item at its currently active
-		 * descriptor and set the item's mount pointer.
-		 */
-		lbsp->lbc_ag = ag;
-		lbsp->lbc_idx = idx;
-		return lbsp;
-	}
-
-	/*
-	 * Find the free descriptor. It is somewhere in the chunklist
-	 * of descriptors.
-	 */
-	lbcp = &tp->t_busy;
-	while (lbcp != NULL) {
-		if (XFS_LBC_VACANCY(lbcp)) {
-			if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
-				i = lbcp->lbc_unused;
-				break;
-			} else {
-				/* out-of-order vacancy */
-				cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
-				ASSERT(0);
-			}
-		}
-		lbcp = lbcp->lbc_next;
-	}
-	ASSERT(lbcp != NULL);
-	/*
-	 * If we find a free descriptor, claim it,
-	 * initialize it, and return it.
-	 */
-	XFS_LBC_CLAIM(lbcp, i);
-	if (lbcp->lbc_unused <= i) {
-		lbcp->lbc_unused = i + 1;
-	}
-	lbsp = XFS_LBC_SLOT(lbcp, i);
-	tp->t_busy_free--;
-	lbsp->lbc_ag = ag;
-	lbsp->lbc_idx = idx;
-	return lbsp;
-}
-
-
-/*
- * xfs_trans_free_busy
- * Free all of the busy lists from a transaction
- */
-void
-xfs_trans_free_busy(xfs_trans_t *tp)
-{
-	xfs_log_busy_chunk_t	*lbcp;
-	xfs_log_busy_chunk_t	*lbcq;
-
-	lbcp = tp->t_busy.lbc_next;
-	while (lbcp != NULL) {
-		lbcq = lbcp->lbc_next;
-		kmem_free(lbcp);
-		lbcp = lbcq;
-	}
-
-	XFS_LBC_INIT(&tp->t_busy);
-	tp->t_busy.lbc_unused = 0;
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad39743..c6e4f2c8de6 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc	*xfs_trans_find_item(struct xfs_trans *,
 struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
 struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
 					     struct xfs_log_item_desc *);
-void				xfs_trans_free_items(struct xfs_trans *, int);
-void				xfs_trans_unlock_items(struct xfs_trans *,
-							xfs_lsn_t);
-void				xfs_trans_free_busy(xfs_trans_t *tp);
-xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
-						    xfs_agnumber_t ag,
-						    xfs_extlen_t idx);
+
+void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
+void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+				int flags);
+
+void	xfs_trans_item_committed(struct xfs_log_item *lip,
+				xfs_lsn_t commit_lsn, int aborted);
+void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
 /*
  * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d0..320775295e3 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 
 typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
 
+typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
+
 /*
  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
  * Disk based types: