117 files changed, 1775 insertions, 1597 deletions
diff --git a/fs/bio.c b/fs/bio.c
index 5eaa70c9d96..71072ab9912 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
 	unsigned int sz = sizeof(struct bio) + extra_size;
 	struct kmem_cache *slab = NULL;
-	struct bio_slab *bslab;
+	struct bio_slab *bslab, *new_bio_slabs;
 	unsigned int i, entry = -1;
 
 	mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 
 	if (bio_slab_nr == bio_slab_max && entry == -1) {
 		bio_slab_max <<= 1;
-		bio_slabs = krealloc(bio_slabs,
-				     bio_slab_max * sizeof(struct bio_slab),
-				     GFP_KERNEL);
-		if (!bio_slabs)
+		new_bio_slabs = krealloc(bio_slabs,
+					 bio_slab_max * sizeof(struct bio_slab),
+					 GFP_KERNEL);
+		if (!new_bio_slabs)
 			goto out_unlock;
+		bio_slabs = new_bio_slabs;
 	}
 	if (entry == -1)
 		entry = bio_slab_nr++;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e519195d45..38e721b35d4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a256f3b2a84..ff6475f409d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1438,10 +1438,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	ret = extent_from_logical(fs_info, logical, path,
 					&found_key);
 	btrfs_release_path(path);
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		ret = -EINVAL;
 	if (ret < 0)
 		return ret;
+	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		return -EINVAL;
 
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(fs_info, found_key.objectid,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 86eff48dab7..43d1c5a3a03 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -818,6 +818,7 @@ static void free_workspace(int type, struct list_head *workspace)
 	btrfs_compress_op[idx]->free_workspace(workspace);
 	atomic_dec(alloc_workspace);
 wake:
+	smp_mb();
 	if (waitqueue_active(workspace_wait))
 		wake_up(workspace_wait);
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9d7621f271f..6d183f60d63 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -421,12 +421,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 
 	/*
-	 * we removed the lowest blocker from the blocker list, so there may be
-	 * more processible delayed refs.
-	 */
-	wake_up(&fs_info->tree_mod_seq_wait);
-
-	/*
 	 * anything that's lower than the lowest existing (read: blocked)
 	 * sequence number can be removed from the tree.
 	 */
@@ -631,6 +625,9 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 	u32 nritems;
 	int ret;
 
+	if (btrfs_header_level(eb) == 0)
+		return;
+
 	nritems = btrfs_header_nritems(eb);
 	for (i = nritems - 1; i >= 0; i--) {
 		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4bab807227a..9821b672f5a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -116,7 +116,7 @@ struct btrfs_ordered_sum;
 #define BTRFS_FREE_SPACE_OBJECTID -11ULL
 
 /*
- * The inode number assigned to the special inode for sotring
+ * The inode number assigned to the special inode for storing
  * free ino cache
  */
 #define BTRFS_FREE_INO_OBJECTID -12ULL
@@ -1252,7 +1252,6 @@ struct btrfs_fs_info {
 	atomic_t tree_mod_seq;
 	struct list_head tree_mod_seq_list;
 	struct seq_list tree_mod_seq_elem;
-	wait_queue_head_t tree_mod_seq_wait;
 
 	/* this protects tree_mod_log */
 	rwlock_t tree_mod_log_lock;
@@ -3192,7 +3191,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-			      struct bio *bio, u64 logical_offset, u32 *dst);
+			      struct bio *bio, u64 logical_offset);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 335605c8cea..07d5eeb1e6f 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -512,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 
 	rb_erase(&delayed_item->rb_node, root);
 	delayed_item->delayed_node->count--;
-	atomic_dec(&delayed_root->items);
-	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
+	if (atomic_dec_return(&delayed_root->items) <
+	    BTRFS_DELAYED_BACKGROUND &&
 	    waitqueue_active(&delayed_root->wait))
 		wake_up(&delayed_root->wait);
 }
@@ -1028,9 +1028,10 @@ do_again:
 		btrfs_release_delayed_item(prev);
 		ret = 0;
 		btrfs_release_path(path);
-		if (curr)
+		if (curr) {
+			mutex_unlock(&node->mutex);
 			goto do_again;
-		else
+		} else
 			goto delete_fail;
 	}
 
@@ -1055,8 +1056,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
 		delayed_node->count--;
 
 		delayed_root = delayed_node->root->fs_info->delayed_root;
-		atomic_dec(&delayed_root->items);
-		if (atomic_read(&delayed_root->items) <
+		if (atomic_dec_return(&delayed_root->items) <
 		    BTRFS_DELAYED_BACKGROUND &&
 		    waitqueue_active(&delayed_root->wait))
 			wake_up(&delayed_root->wait);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index da7419ed01b..ae941177339 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -38,17 +38,14 @@
 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
 			  struct btrfs_delayed_tree_ref *ref1)
 {
-	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
-		if (ref1->root < ref2->root)
-			return -1;
-		if (ref1->root > ref2->root)
-			return 1;
-	} else {
-		if (ref1->parent < ref2->parent)
-			return -1;
-		if (ref1->parent > ref2->parent)
-			return 1;
-	}
+	if (ref1->root < ref2->root)
+		return -1;
+	if (ref1->root > ref2->root)
+		return 1;
+	if (ref1->parent < ref2->parent)
+		return -1;
+	if (ref1->parent > ref2->parent)
+		return 1;
 	return 0;
 }
 
@@ -85,7 +82,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
  * type of the delayed backrefs and content of delayed backrefs.
  */
 static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-		      struct btrfs_delayed_ref_node *ref1)
+		      struct btrfs_delayed_ref_node *ref1,
+		      bool compare_seq)
 {
 	if (ref1->bytenr < ref2->bytenr)
 		return -1;
@@ -102,10 +100,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 	if (ref1->type > ref2->type)
 		return 1;
 	/* merging of sequenced refs is not allowed */
-	if (ref1->seq < ref2->seq)
-		return -1;
-	if (ref1->seq > ref2->seq)
-		return 1;
+	if (compare_seq) {
+		if (ref1->seq < ref2->seq)
+			return -1;
+		if (ref1->seq > ref2->seq)
+			return 1;
+	}
 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -139,7 +139,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 rb_node);
 
-		cmp = comp_entry(entry, ins);
+		cmp = comp_entry(entry, ins, 1);
 		if (cmp < 0)
 			p = &(*p)->rb_left;
 		else if (cmp > 0)
@@ -233,6 +233,114 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static void inline drop_delayed_ref(struct btrfs_trans_handle *trans,
+				    struct btrfs_delayed_ref_root *delayed_refs,
+				    struct btrfs_delayed_ref_node *ref)
+{
+	rb_erase(&ref->rb_node, &delayed_refs->root);
+	ref->in_tree = 0;
+	btrfs_put_delayed_ref(ref);
+	delayed_refs->num_entries--;
+	if (trans->delayed_ref_updates)
+		trans->delayed_ref_updates--;
+}
+
+static int merge_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_root *delayed_refs,
+		     struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+	struct rb_node *node;
+	int merged = 0;
+	int mod = 0;
+	int done = 0;
+
+	node = rb_prev(&ref->rb_node);
+	while (node) {
+		struct btrfs_delayed_ref_node *next;
+
+		next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_prev(node);
+		if (next->bytenr != ref->bytenr)
+			break;
+		if (seq && next->seq >= seq)
+			break;
+		if (comp_entry(ref, next, 0))
+			continue;
+
+		if (ref->action == next->action) {
+			mod = next->ref_mod;
+		} else {
+			if (ref->ref_mod < next->ref_mod) {
+				struct btrfs_delayed_ref_node *tmp;
+
+				tmp = ref;
+				ref = next;
+				next = tmp;
+				done = 1;
+			}
+			mod = -next->ref_mod;
+		}
+
+		merged++;
+		drop_delayed_ref(trans, delayed_refs, next);
+		ref->ref_mod += mod;
+		if (ref->ref_mod == 0) {
+			drop_delayed_ref(trans, delayed_refs, ref);
+			break;
+		} else {
+			/*
+			 * You can't have multiples of the same ref on a tree
+			 * block.
+			 */
+			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+
+		if (done)
+			break;
+		node = rb_prev(&ref->rb_node);
+	}
+
+	return merged;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head)
+{
+	struct rb_node *node;
+	u64 seq = 0;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		struct seq_list *elem;
+
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		seq = elem->seq;
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	node = rb_prev(&head->node.rb_node);
+	while (node) {
+		struct btrfs_delayed_ref_node *ref;
+
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+			       rb_node);
+		if (ref->bytenr != head->node.bytenr)
+			break;
+
+		/* We can't merge refs that are outside of our seq count */
+		if (seq && ref->seq >= seq)
+			break;
+		if (merge_ref(trans, delayed_refs, ref, seq))
+			node = rb_prev(&head->node.rb_node);
+		else
+			node = rb_prev(node);
+	}
+}
+
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
@@ -336,18 +444,11 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 		 * every changing the extent allocation tree.
 		 */
 		existing->ref_mod--;
-		if (existing->ref_mod == 0) {
-			rb_erase(&existing->rb_node,
-				 &delayed_refs->root);
-			existing->in_tree = 0;
-			btrfs_put_delayed_ref(existing);
-			delayed_refs->num_entries--;
-			if (trans->delayed_ref_updates)
-				trans->delayed_ref_updates--;
-		} else {
+		if (existing->ref_mod == 0)
+			drop_delayed_ref(trans, delayed_refs, existing);
+		else
 			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
 				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
-		}
 	} else {
 		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
 			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -662,9 +763,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, level, action,
 				   for_cow);
-	if (!need_ref_seq(for_cow, ref_root) &&
-	    waitqueue_active(&fs_info->tree_mod_seq_wait))
-		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
 	if (need_ref_seq(for_cow, ref_root))
 		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
@@ -713,9 +811,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, owner, offset,
 				   action, for_cow);
-	if (!need_ref_seq(for_cow, ref_root) &&
-	    waitqueue_active(&fs_info->tree_mod_seq_wait))
-		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
 	if (need_ref_seq(for_cow, ref_root))
 		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
@@ -744,8 +839,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
 				   extent_op->is_data);
 
-	if (waitqueue_active(&fs_info->tree_mod_seq_wait))
-		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 0d7c90c366b..c9d703693df 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,7 +18,7 @@
 #ifndef __DELAYED_REF__
 #define __DELAYED_REF__
 
-/* these are the possible values of struct btrfs_delayed_ref->action */
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
 #define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
 #define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
@@ -167,6 +167,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head);
 
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62e0cafd6e2..22e98e04c2e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -377,9 +377,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		ret = read_extent_buffer_pages(io_tree, eb, start,
 					       WAIT_COMPLETE,
 					       btree_get_extent, mirror_num);
-		if (!ret && !verify_parent_transid(io_tree, eb,
+		if (!ret) {
+			if (!verify_parent_transid(io_tree, eb,
 						   parent_transid, 0))
-			break;
+				break;
+			else
+				ret = -EIO;
+		}
 
 		/*
 		 * This buffer's crc is fine, but its contents are corrupted, so
@@ -754,9 +758,7 @@ static void run_one_async_done(struct btrfs_work *work)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
-	atomic_dec(&fs_info->nr_async_submits);
-
-	if (atomic_read(&fs_info->nr_async_submits) < limit &&
+	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
 	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
 
@@ -2032,8 +2034,6 @@ int open_ctree(struct super_block *sb,
 	fs_info->free_chunk_space = 0;
 	fs_info->tree_mod_log = RB_ROOT;
 
-	init_waitqueue_head(&fs_info->tree_mod_seq_wait);
-
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 	spin_lock_init(&fs_info->reada_lock);
@@ -2528,8 +2528,7 @@ retry_root_backup:
 		goto fail_trans_kthread;
 
 	/* do not make disk changes in broken FS */
-	if (btrfs_super_log_root(disk_super) != 0 &&
-	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
+	if (btrfs_super_log_root(disk_super) != 0) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
@@ -3189,30 +3188,14 @@ int close_ctree(struct btrfs_root *root)
 	/* clear out the rbtree of defraggable inodes */
 	btrfs_run_defrag_inodes(fs_info);
 
-	/*
-	 * Here come 2 situations when btrfs is broken to flip readonly:
-	 *
-	 * 1. when btrfs flips readonly somewhere else before
-	 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
-	 * and btrfs will skip to write sb directly to keep
-	 * ERROR state on disk.
-	 *
-	 * 2. when btrfs flips readonly just in btrfs_commit_super,
-	 * and in such case, btrfs cannot write sb via btrfs_commit_super,
-	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
-	 * btrfs will cleanup all FS resources first and write sb then.
-	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-		ret = btrfs_error_commit_super(root);
-		if (ret)
-			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
-	}
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		btrfs_error_commit_super(root);
 
 	btrfs_put_block_group_cache(fs_info);
 
@@ -3434,18 +3417,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	if (read_only)
 		return 0;
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-		printk(KERN_WARNING "warning: mount fs with errors, "
-		       "running btrfsck is recommended\n");
-	}
-
 	return 0;
 }
 
-int btrfs_error_commit_super(struct btrfs_root *root)
+void btrfs_error_commit_super(struct btrfs_root *root)
 {
-	int ret;
-
 	mutex_lock(&root->fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(root);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -3455,10 +3431,6 @@ int btrfs_error_commit_super(struct btrfs_root *root)
 
 	/* cleanup FS via transaction */
 	btrfs_cleanup_transaction(root);
-
-	ret = write_ctree_super(NULL, root, 0);
-
-	return ret;
 }
 
 static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
@@ -3782,14 +3754,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 		/* FIXME: cleanup wait for commit */
 		t->in_commit = 1;
 		t->blocked = 1;
+		smp_mb();
 		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
 			wake_up(&root->fs_info->transaction_blocked_wait);
 
 		t->blocked = 0;
+		smp_mb();
 		if (waitqueue_active(&root->fs_info->transaction_wait))
 			wake_up(&root->fs_info->transaction_wait);
 
 		t->commit_done = 1;
+		smp_mb();
 		if (waitqueue_active(&t->commit_wait))
 			wake_up(&t->commit_wait);
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95e147eea23..c5b00a735fe 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
-int btrfs_error_commit_super(struct btrfs_root *root);
+void btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4e1b153b7c4..ba58024d40d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2252,6 +2252,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 
 		/*
+		 * We need to try and merge add/drops of the same ref since we
+		 * can run into issues with relocate dropping the implicit ref
+		 * and then it being added back again before the drop can
+		 * finish.  If we merged anything we need to re-loop so we can
+		 * get a good ref.
+		 */
+		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+					 locked_ref);
+
+		/*
 		 * locked_ref is the head node, so we have to go one
 		 * node back for any delayed ref updates
 		 */
@@ -2318,12 +2328,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		ref->in_tree = 0;
 		rb_erase(&ref->rb_node, &delayed_refs->root);
 		delayed_refs->num_entries--;
-		/*
-		 * we modified num_entries, but as we're currently running
-		 * delayed refs, skip
-		 *     wake_up(&delayed_refs->seq_wait);
-		 * here.
-		 */
+		if (locked_ref) {
+			/*
+			 * when we play the delayed ref, also correct the
+			 * ref_mod on head
+			 */
+			switch (ref->action) {
+			case BTRFS_ADD_DELAYED_REF:
+			case BTRFS_ADD_DELAYED_EXTENT:
+				locked_ref->node.ref_mod -= ref->ref_mod;
+				break;
+			case BTRFS_DROP_DELAYED_REF:
+				locked_ref->node.ref_mod += ref->ref_mod;
+				break;
+			default:
+				WARN_ON(1);
+			}
+		}
 		spin_unlock(&delayed_refs->lock);
 
 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2350,22 +2371,6 @@ next:
 	return count;
 }
 
-static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
-			       struct btrfs_delayed_ref_root *delayed_refs,
-			       unsigned long num_refs,
-			       struct list_head *first_seq)
-{
-	spin_unlock(&delayed_refs->lock);
-	pr_debug("waiting for more refs (num %ld, first %p)\n",
-		 num_refs, first_seq);
-	wait_event(fs_info->tree_mod_seq_wait,
-		   num_refs != delayed_refs->num_entries ||
-		   fs_info->tree_mod_seq_list.next != first_seq);
-	pr_debug("done waiting for more refs (num %ld, first %p)\n",
-		 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
-	spin_lock(&delayed_refs->lock);
-}
-
 #ifdef SCRAMBLE_DELAYED_REFS
 /*
  * Normally delayed refs get processed in ascending bytenr order. This
@@ -2460,13 +2465,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct list_head cluster;
-	struct list_head *first_seq = NULL;
 	int ret;
 	u64 delayed_start;
 	int run_all = count == (unsigned long)-1;
 	int run_most = 0;
-	unsigned long num_refs = 0;
-	int consider_waiting;
+	int loops;
 
 	/* We'll clean this up in btrfs_cleanup_transaction */
 	if (trans->aborted)
@@ -2484,7 +2487,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
 again:
-	consider_waiting = 0;
+	loops = 0;
 	spin_lock(&delayed_refs->lock);
 
 #ifdef SCRAMBLE_DELAYED_REFS
@@ -2512,31 +2515,6 @@ again:
 		if (ret)
 			break;
 
-		if (delayed_start >= delayed_refs->run_delayed_start) {
-			if (consider_waiting == 0) {
-				/*
-				 * btrfs_find_ref_cluster looped. let's do one
-				 * more cycle. if we don't run any delayed ref
-				 * during that cycle (because we can't because
-				 * all of them are blocked) and if the number of
-				 * refs doesn't change, we avoid busy waiting.
-				 */
-				consider_waiting = 1;
-				num_refs = delayed_refs->num_entries;
-				first_seq = root->fs_info->tree_mod_seq_list.next;
-			} else {
-				wait_for_more_refs(root->fs_info, delayed_refs,
-						   num_refs, first_seq);
-				/*
-				 * after waiting, things have changed. we
-				 * dropped the lock and someone else might have
-				 * run some refs, built new clusters and so on.
-				 * therefore, we restart staleness detection.
-				 */
-				consider_waiting = 0;
-			}
-		}
-
 		ret = run_clustered_refs(trans, root, &cluster);
 		if (ret < 0) {
 			spin_unlock(&delayed_refs->lock);
@@ -2549,9 +2527,26 @@ again:
 		if (count == 0)
 			break;
 
-		if (ret || delayed_refs->run_delayed_start == 0) {
+		if (delayed_start >= delayed_refs->run_delayed_start) {
+			if (loops == 0) {
+				/*
+				 * btrfs_find_ref_cluster looped. let's do one
+				 * more cycle. if we don't run any delayed ref
+				 * during that cycle (because we can't because
+				 * all of them are blocked), bail out.
+				 */
+				loops = 1;
+			} else {
+				/*
+				 * no runnable refs left, stop trying
+				 */
+				BUG_ON(run_all);
+				break;
+			}
+		}
+		if (ret) {
 			/* refs were run, let's reset staleness detection */
-			consider_waiting = 0;
+			loops = 0;
 		}
 	}
 
@@ -3007,17 +3002,16 @@ again:
 	}
 	spin_unlock(&block_group->lock);
 
-	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+	/*
+	 * Try to preallocate enough space based on how big the block group is.
+	 * Keep in mind this has to include any pinned space which could end up
+	 * taking up quite a bit since it's not folded into the other space
+	 * cache.
+	 */
+	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
 	if (!num_pages)
 		num_pages = 1;
 
-	/*
-	 * Just to make absolutely sure we have enough space, we're going to
-	 * preallocate 12 pages worth of space for each block group.  In
-	 * practice we ought to use at most 8, but we need extra space so we can
-	 * add our header and have a terminator between the extents and the
-	 * bitmaps.
-	 */
 	num_pages *= 16;
 	num_pages *= PAGE_CACHE_SIZE;
 
@@ -4571,8 +4565,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	if (root->fs_info->quota_enabled) {
 		ret = btrfs_qgroup_reserve(root, num_bytes +
 					   nr_extents * root->leafsize);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 			return ret;
+		}
 	}
 
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -5294,9 +5290,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	rb_erase(&head->node.rb_node, &delayed_refs->root);
 
 	delayed_refs->num_entries--;
-	smp_mb();
-	if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
-		wake_up(&root->fs_info->tree_mod_seq_wait);
 
 	/*
 	 * we don't take a ref on the node because we're removing it from the
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 45c81bb4ac8..4c878476bb9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2330,23 +2330,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
 							      state, mirror);
-			if (ret) {
-				/* no IO indicated but software detected errors
-				 * in the block, either checksum errors or
-				 * issues with the contents */
-				struct btrfs_root *root =
-					BTRFS_I(page->mapping->host)->root;
-				struct btrfs_device *device;
-
+			if (ret)
 				uptodate = 0;
-				device = btrfs_find_device_for_logical(
-						root, start, mirror);
-				if (device)
-					btrfs_dev_stat_inc_and_print(device,
-						BTRFS_DEV_STAT_CORRUPTION_ERRS);
-			} else {
+			else
 				clean_io_failure(start, page);
-			}
 		}
 
 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b45b9de0c21..857d93cd01d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -272,9 +272,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 }
 
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-			      struct bio *bio, u64 offset, u32 *dst)
+			      struct bio *bio, u64 offset)
 {
-	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
 }
 
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6e8f416773d..316b07a866d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1008,9 +1008,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
 		PAGE_CACHE_SHIFT;
 
-	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
-
-	if (atomic_read(&root->fs_info->async_delalloc_pages) <
+	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
 	    5 * 1024 * 1024 &&
 	    waitqueue_active(&root->fs_info->async_submit_wait))
 		wake_up(&root->fs_info->async_submit_wait);
@@ -1885,8 +1883,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 				trans = btrfs_join_transaction_nolock(root);
 			else
 				trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				trans = NULL;
+				goto out;
+			}
 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode_fallback(trans, root, inode);
 			if (ret) /* -ENOMEM or corruption */
@@ -1970,8 +1971,8 @@ out:
 				      ordered_extent->len - 1, NULL, GFP_NOFS);
 
 	/*
-	 * This needs to be dont to make sure anybody waiting knows we are done
-	 * upating everything for this ordered extent.
+	 * This needs to be done to make sure anybody waiting knows we are done
+	 * updating everything for this ordered extent.
 	 */
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 
@@ -3174,7 +3175,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
 	inode_inc_iversion(dir);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-	ret = btrfs_update_inode(trans, root, dir);
+	ret = btrfs_update_inode_fallback(trans, root, dir);
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
 out:
@@ -5774,18 +5775,112 @@ out:
 	return ret;
 }
 
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+			      struct extent_state **cached_state, int writing)
+{
+	struct btrfs_ordered_extent *ordered;
+	int ret = 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, cached_state);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent.  The
+		 * invalidate needs to happen so that reads after a write do not
+		 * get stale data.
+		 */
+		if (!ordered && (!writing ||
+		    !test_range_bit(&BTRFS_I(inode)->io_tree,
+				    lockstart, lockend, EXTENT_UPTODATE, 0,
+				    *cached_state)))
+			break;
+
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     cached_state, GFP_NOFS);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			/* Screw you mmap */
+			ret = filemap_write_and_wait_range(inode->i_mapping,
+							   lockstart,
+							   lockend);
+			if (ret)
+				break;
+
+			/*
+			 * If we found a page that couldn't be invalidated just
+			 * fall back to buffered.
+			 */
+			ret = invalidate_inode_pages2_range(inode->i_mapping,
+					lockstart >> PAGE_CACHE_SHIFT,
+					lockend >> PAGE_CACHE_SHIFT);
+			if (ret)
+				break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
+	u64 lockstart, lockend;
 	u64 len = bh_result->b_size;
 	struct btrfs_trans_handle *trans;
+	int unlock_bits = EXTENT_LOCKED;
+	int ret;
+
+	if (create) {
+		ret = btrfs_delalloc_reserve_space(inode, len);
+		if (ret)
+			return ret;
+		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+	} else {
+		len = min_t(u64, len, root->sectorsize);
+	}
+
+	lockstart = start;
+	lockend = start + len - 1;
+
+	/*
+	 * If this errors out it's because we couldn't invalidate pagecache for
+	 * this range and we need to fallback to buffered.
+	 */
+	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+		return -ENOTBLK;
+
+	if (create) {
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, EXTENT_DELALLOC, NULL,
+				     &cached_state, GFP_NOFS);
+		if (ret)
+			goto unlock_err;
+	}
 
 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 
 	/*
 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5804,17 +5899,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
 	    em->block_start == EXTENT_MAP_INLINE) {
 		free_extent_map(em);
-		return -ENOTBLK;
+		ret = -ENOTBLK;
+		goto unlock_err;
 	}
 
 	/* Just a good old fashioned hole, return */
 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
 		free_extent_map(em);
-		/* DIO will do one hole at a time, so just unlock a sector */
-		unlock_extent(&BTRFS_I(inode)->io_tree, start,
-			      start + root->sectorsize - 1);
-		return 0;
+		ret = 0;
+		goto unlock_err;
 	}
 
 	/*
@@ -5827,8 +5921,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	 *
 	 */
 	if (!create) {
-		len = em->len - (start - em->start);
-		goto map;
+		len = min(len, em->len - (start - em->start));
+		lockstart = start + len;
+		goto unlock;
 	}
 
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5860,7 +5955,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 			btrfs_end_transaction(trans, root);
 			if (ret) {
 				free_extent_map(em);
-				return ret;
+				goto unlock_err;
 			}
 			goto unlock;
 		}
@@ -5873,14 +5968,12 @@ must_cow:
 	 */
 	len = bh_result->b_size;
 	em = btrfs_new_extent_direct(inode, em, start, len);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 	len = min(len, em->len - (start - em->start));
 unlock:
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
-			  0, NULL, GFP_NOFS);
-map:
 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
 		inode->i_blkbits;
 	bh_result->b_size = len;
@@ -5898,9 +5991,44 @@ map:
 			i_size_write(inode, start + len);
 	}
 
+	/*
+	 * In the case of write we need to clear and unlock the entire range,
+	 * in the case of read we need to unlock only the end area that we
+	 * aren't using if there is any left over space.
+	 */
+	if (lockstart < lockend) {
+		if (create && len < lockend - lockstart) {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+					 lockstart + len - 1, unlock_bits, 1, 0,
+					 &cached_state, GFP_NOFS);
+			/*
+			 * Beside unlock, we also need to cleanup reserved space
+			 * for the left range by attaching EXTENT_DO_ACCOUNTING.
+			 */
+			clear_extent_bit(&BTRFS_I(inode)->io_tree,
+					 lockstart + len, lockend,
+					 unlock_bits | EXTENT_DO_ACCOUNTING,
+					 1, 0, NULL, GFP_NOFS);
+		} else {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+					 lockend, unlock_bits, 1, 0,
+					 &cached_state, GFP_NOFS);
+		}
+	} else {
+		free_extent_state(cached_state);
+	}
+
 	free_extent_map(em);
 
 	return 0;
+
+unlock_err:
+	if (create)
+		unlock_bits |= EXTENT_DO_ACCOUNTING;
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	return ret;
 }
 
 struct btrfs_dio_private {
@@ -5908,7 +6036,6 @@ struct btrfs_dio_private {
 	u64 logical_offset;
 	u64 disk_bytenr;
 	u64 bytes;
-	u32 *csums;
 	void *private;
 
 	/* number of bios pending for this dio */
@@ -5928,7 +6055,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 start;
-	u32 *private = dip->csums;
 
 	start = dip->logical_offset;
 	do {
@@ -5936,8 +6062,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 			struct page *page = bvec->bv_page;
 			char *kaddr;
 			u32 csum = ~(u32)0;
+			u64 private = ~(u32)0;
 			unsigned long flags;
 
+			if (get_state_private(&BTRFS_I(inode)->io_tree,
+					      start, &private))
+				goto failed;
 			local_irq_save(flags);
 			kaddr = kmap_atomic(page);
 			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
@@ -5947,18 +6077,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 			local_irq_restore(flags);
 
 			flush_dcache_page(bvec->bv_page);
-			if (csum != *private) {
+			if (csum != private) {
+failed:
 				printk(KERN_ERR "btrfs csum failed ino %llu off"
 				      " %llu csum %u private %u\n",
 				      (unsigned long long)btrfs_ino(inode),
 				      (unsigned long long)start,
-				      csum, *private);
+				      csum, (unsigned)private);
 				err = -EIO;
 			}
 		}
 
 		start += bvec->bv_len;
-		private++;
 		bvec++;
 	} while (bvec <= bvec_end);
 
@@ -5966,7 +6096,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 		      dip->logical_offset + dip->bytes - 1);
 	bio->bi_private = dip->private;
 
-	kfree(dip->csums);
 	kfree(dip);
 
 	/* If we had a csum failure make sure to clear the uptodate flag */
@@ -6072,7 +6201,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
 
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 					 int rw, u64 file_offset, int skip_sum,
-					 u32 *csums, int async_submit)
+					 int async_submit)
 {
 	int write = rw & REQ_WRITE;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6105,8 +6234,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 		if (ret)
 			goto err;
 	} else if (!skip_sum) {
-		ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
-					  file_offset, csums);
+		ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
 		if (ret)
 			goto err;
 	}
@@ -6132,10 +6260,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	u64 submit_len = 0;
 	u64 map_length;
 	int nr_pages = 0;
-	u32 *csums = dip->csums;
 	int ret = 0;
 	int async_submit = 0;
-	int write = rw & REQ_WRITE;
 
 	map_length = orig_bio->bi_size;
 	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -6171,16 +6297,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			atomic_inc(&dip->pending_bios);
 			ret = __btrfs_submit_dio_bio(bio, inode, rw,
 						     file_offset, skip_sum,
-						     csums, async_submit);
+						     async_submit);
 			if (ret) {
 				bio_put(bio);
 				atomic_dec(&dip->pending_bios);
 				goto out_err;
 			}
 
-			/* Write's use the ordered csums */
-			if (!write && !skip_sum)
-				csums = csums + nr_pages;
 			start_sector += submit_len >> 9;
 			file_offset += submit_len;
 
@@ -6210,7 +6333,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
 submit:
 	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
-				     csums, async_submit);
+				     async_submit);
 	if (!ret)
 		return 0;
 
@@ -6246,17 +6369,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 		ret = -ENOMEM;
 		goto free_ordered;
 	}
-	dip->csums = NULL;
-
-	/* Write's use the ordered csum stuff, so we don't need dip->csums */
-	if (!write && !skip_sum) {
-		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
-		if (!dip->csums) {
-			kfree(dip);
-			ret = -ENOMEM;
-			goto free_ordered;
-		}
-	}
 
 	dip->private = bio->bi_private;
 	dip->inode = inode;
@@ -6341,132 +6453,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
 out:
 	return retval;
 }
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cached_state = NULL;
-	u64 lockstart, lockend;
-	ssize_t ret;
-	int writing = rw & WRITE;
-	int write_bits = 0;
-	size_t count = iov_length(iov, nr_segs);
 
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-			    offset, nr_segs)) {
+			    offset, nr_segs))
 		return 0;
-	}
-
-	lockstart = offset;
-	lockend = offset + count - 1;
-
-	if (writing) {
-		ret = btrfs_delalloc_reserve_space(inode, count);
-		if (ret)
-			goto out;
-	}
-
-	while (1) {
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 0, &cached_state);
-		/*
-		 * We're concerned with the entire range that we're going to be
-		 * doing DIO to, so we need to make sure theres no ordered
-		 * extents in this range.
-		 */
-		ordered = btrfs_lookup_ordered_range(inode, lockstart,
-						     lockend - lockstart + 1);
-
-		/*
-		 * We need to make sure there are no buffered pages in this
-		 * range either, we could have raced between the invalidate in
-		 * generic_file_direct_write and locking the extent.  The
-		 * invalidate needs to happen so that reads after a write do not
-		 * get stale data.
-		 */
-		if (!ordered && (!writing ||
-		    !test_range_bit(&BTRFS_I(inode)->io_tree,
-				    lockstart, lockend, EXTENT_UPTODATE, 0,
-				    cached_state)))
-			break;
-
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     &cached_state, GFP_NOFS);
-
-		if (ordered) {
-			btrfs_start_ordered_extent(inode, ordered, 1);
-			btrfs_put_ordered_extent(ordered);
-		} else {
-			/* Screw you mmap */
-			ret = filemap_write_and_wait_range(file->f_mapping,
-							   lockstart,
-							   lockend);
-			if (ret)
-				goto out;
-
-			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
-			 */
-			ret = invalidate_inode_pages2_range(file->f_mapping,
-					lockstart >> PAGE_CACHE_SHIFT,
-					lockend >> PAGE_CACHE_SHIFT);
-			if (ret) {
-				if (ret == -EBUSY)
-					ret = 0;
-				goto out;
-			}
-		}
-
-		cond_resched();
-	}
 
-	/*
-	 * we don't use btrfs_set_extent_delalloc because we don't want
-	 * the dirty or uptodate bits
-	 */
-	if (writing) {
-		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
-		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     EXTENT_DELALLOC, NULL, &cached_state,
-				     GFP_NOFS);
-		if (ret) {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockend, EXTENT_LOCKED | write_bits,
-					 1, 0, &cached_state, GFP_NOFS);
-			goto out;
-		}
-	}
-
-	free_extent_state(cached_state);
-	cached_state = NULL;
-
-	ret = __blockdev_direct_IO(rw, iocb, inode,
+	return __blockdev_direct_IO(rw, iocb, inode,
 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
 		   btrfs_submit_direct, 0);
-
-	if (ret < 0 && ret != -EIOCBQUEUED) {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-			      offset + iov_length(iov, nr_segs) - 1,
-			      EXTENT_LOCKED | write_bits, 1, 0,
-			      &cached_state, GFP_NOFS);
-	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
-		/*
-		 * We're falling back to buffered, unlock the section we didn't
-		 * do IO on.
-		 */
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
-			      offset + iov_length(iov, nr_segs) - 1,
-			      EXTENT_LOCKED | write_bits, 1, 0,
-			      &cached_state, GFP_NOFS);
-	}
-out:
-	free_extent_state(cached_state);
-	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7bb755677a2..9df50fa8a07 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -424,7 +424,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	uuid_le_gen(&new_uuid);
 	memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
 	root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
-	root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec);
+	root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec);
 	root_item.ctime = root_item.otime;
 	btrfs_set_root_ctransid(&root_item, trans->transid);
 	btrfs_set_root_otransid(&root_item, trans->transid);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a44eff07480..2a1762c6604 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -67,7 +67,7 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
 	if (eb->lock_nested) {
 		read_lock(&eb->lock);
-		if (&eb->lock_nested && current->pid == eb->lock_owner) {
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
 			read_unlock(&eb->lock);
 			return;
 		}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bc424ae5a81..b6501558174 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1364,8 +1364,10 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 	spin_lock(&fs_info->qgroup_lock);
 
 	dstgroup = add_qgroup_rb(fs_info, objectid);
-	if (!dstgroup)
+	if (IS_ERR(dstgroup)) {
+		ret = PTR_ERR(dstgroup);
 		goto unlock;
+	}
 
 	if (srcid) {
 		srcgroup = find_qgroup_rb(fs_info, srcid);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6bb465cca20..10d8e4d8807 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -544,8 +544,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 	struct timespec ct = CURRENT_TIME;
 
 	spin_lock(&root->root_times_lock);
-	item->ctransid = trans->transid;
+	item->ctransid = cpu_to_le64(trans->transid);
 	item->ctime.sec = cpu_to_le64(ct.tv_sec);
-	item->ctime.nsec = cpu_to_le64(ct.tv_nsec);
+	item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
 	spin_unlock(&root->root_times_lock);
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f2eb24c477a..83d6f9f9c22 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -838,7 +838,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *root = fs_info->tree_root;
-	int ret;
 
 	trace_btrfs_sync_fs(wait);
 
@@ -849,11 +848,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	btrfs_wait_ordered_extents(root, 0, 0);
 
-	trans = btrfs_start_transaction(root, 0);
+	spin_lock(&fs_info->trans_lock);
+	if (!fs_info->running_transaction) {
+		spin_unlock(&fs_info->trans_lock);
+		return 0;
+	}
+	spin_unlock(&fs_info->trans_lock);
+
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
-	ret = btrfs_commit_transaction(trans, root);
-	return ret;
+	return btrfs_commit_transaction(trans, root);
 }
 
 static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1530,6 +1535,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 	while (cur_devices) {
 		head = &cur_devices->devices;
 		list_for_each_entry(dev, head, dev_list) {
+			if (dev->missing)
+				continue;
 			if (!first_dev || dev->devid < first_dev->devid)
 				first_dev = dev;
 		}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 17be3dedacb..27c26004e05 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1031,6 +1031,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
 					 dentry->d_name.len * 2);
+	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	if (ret)
 		goto abort_trans_dput;
@@ -1066,7 +1067,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
 			BTRFS_UUID_SIZE);
 	new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
-	new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec);
+	new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec);
 	btrfs_set_root_otransid(new_root_item, trans->transid);
 	memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
 	memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e86ae04abe6..88b969aeeb7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -227,9 +227,8 @@ loop_lock:
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&fs_info->nr_async_bios);
 
-		if (atomic_read(&fs_info->nr_async_bios) < limit &&
+		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
 		    waitqueue_active(&fs_info->async_submit_wait))
 			wake_up(&fs_info->async_submit_wait);
 
@@ -569,9 +568,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		memcpy(new_device, device, sizeof(*new_device));
 
 		/* Safe because we are under uuid_mutex */
-		name = rcu_string_strdup(device->name->str, GFP_NOFS);
-		BUG_ON(device->name && !name); /* -ENOMEM */
-		rcu_assign_pointer(new_device->name, name);
+		if (device->name) {
+			name = rcu_string_strdup(device->name->str, GFP_NOFS);
+			BUG_ON(device->name && !name); /* -ENOMEM */
+			rcu_assign_pointer(new_device->name, name);
+		}
 		new_device->bdev = NULL;
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
@@ -4605,28 +4606,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	return ret;
 }
 
-struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
-						   u64 logical, int mirror_num)
-{
-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
-	int ret;
-	u64 map_length = 0;
-	struct btrfs_bio *bbio = NULL;
-	struct btrfs_device *device;
-
-	BUG_ON(mirror_num == 0);
-	ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
-			      mirror_num);
-	if (ret) {
-		BUG_ON(bbio != NULL);
-		return NULL;
-	}
-	BUG_ON(mirror_num != bbio->mirror_num);
-	device = bbio->stripes[mirror_num - 1].dev;
-	kfree(bbio);
-	return device;
-}
-
 int btrfs_read_chunk_tree(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5479325987b..53c06af92e8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -289,8 +289,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);
-struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
-						   u64 logical, int mirror_num);
 void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
 int btrfs_get_dev_stats(struct btrfs_root *root,
diff --git a/fs/buffer.c b/fs/buffer.c
index 9f6d2e41281..58e2e7b7737 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 /*
  * Initialise the state of a blockdev page's buffers.
  */ 
-static void
+static sector_t
 init_page_buffers(struct page *page, struct block_device *bdev,
 			sector_t block, int size)
 {
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 		block++;
 		bh = bh->b_this_page;
 	} while (bh != head);
+
+	/*
+	 * Caller needs to validate requested block against end of device.
+	 */
+	return end_block;
 }
 
 /*
  * Create the page-cache page that contains the requested block.
  *
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
  */
-static struct page *
+static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-		pgoff_t index, int size)
+		pgoff_t index, int size, int sizebits)
 {
 	struct inode *inode = bdev->bd_inode;
 	struct page *page;
 	struct buffer_head *bh;
+	sector_t end_block;
+	int ret = 0;		/* Will call free_more_memory() */
 
 	page = find_or_create_page(inode->i_mapping, index,
 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
 	if (!page)
-		return NULL;
+		return ret;
 
 	BUG_ON(!PageLocked(page));
 
 	if (page_has_buffers(page)) {
 		bh = page_buffers(page);
 		if (bh->b_size == size) {
-			init_page_buffers(page, bdev, block, size);
-			return page;
+			end_block = init_page_buffers(page, bdev,
+						index << sizebits, size);
+			goto done;
 		}
 		if (!try_to_free_buffers(page))
 			goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	 */
 	spin_lock(&inode->i_mapping->private_lock);
 	link_dev_buffers(page, bh);
-	init_page_buffers(page, bdev, block, size);
+	end_block = init_page_buffers(page, bdev, index << sizebits, size);
 	spin_unlock(&inode->i_mapping->private_lock);
-	return page;
-
+done:
+	ret = (block < end_block) ? 1 : -ENXIO;
 failed:
 	unlock_page(page);
 	page_cache_release(page);
-	return NULL;
+	return ret;
 }
 
 /*
@@ -999,7 +1007,6 @@ failed:
 static int
 grow_buffers(struct block_device *bdev, sector_t block, int size)
 {
-	struct page *page;
 	pgoff_t index;
 	int sizebits;
 
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 			bdevname(bdev, b));
 		return -EIO;
 	}
-	block = index << sizebits;
+
 	/* Create a page with the proper size buffers.. */
-	page = grow_dev_page(bdev, block, index, size);
-	if (!page)
-		return 0;
-	unlock_page(page);
-	page_cache_release(page);
-	return 1;
+	return grow_dev_page(bdev, block, index, size, sizebits);
 }
 
 static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
-	int ret;
-	struct buffer_head *bh;
-
 	/* Size must be multiple of hard sectorsize */
 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 		return NULL;
 	}
 
-retry:
-	bh = __find_get_block(bdev, block, size);
-	if (bh)
-		return bh;
+	for (;;) {
+		struct buffer_head *bh;
+		int ret;
 
-	ret = grow_buffers(bdev, block, size);
-	if (ret == 0) {
-		free_more_memory();
-		goto retry;
-	} else if (ret > 0) {
 		bh = __find_get_block(bdev, block, size);
 		if (bh)
 			return bh;
+
+		ret = grow_buffers(bdev, block, size);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
+			free_more_memory();
 	}
-	return NULL;
 }
 
 /*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
  * which corresponds to the passed block_device, block and size. The
  * returned buffer has its reference count incremented.
  *
- * __getblk() cannot fail - it just keeps trying.  If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block.  Very weird.
- *
  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
  * attempt is failing.  FIXME, perhaps?
  */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb962efdace..6d59006bfa2 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -201,6 +201,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	int err = -ENOMEM;
 
 	dout("ceph_fs_debugfs_init\n");
+	BUG_ON(!fsc->client->debugfs_dir);
 	fsc->debugfs_congestion_kb =
 		debugfs_create_file("writeback_congestion_kb",
 				    0600,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9fff9f3b17e..4b5762ef7c2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -992,11 +992,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	if (rinfo->head->is_dentry) {
 		struct inode *dir = req->r_locked_dir;
 
-		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
-				 session, req->r_request_started, -1,
-				 &req->r_caps_reservation);
-		if (err < 0)
-			return err;
+		if (dir) {
+			err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+					 session, req->r_request_started, -1,
+					 &req->r_caps_reservation);
+			if (err < 0)
+				return err;
+		} else {
+			WARN_ON_ONCE(1);
+		}
 	}
 
 	/*
@@ -1004,6 +1008,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	 * will have trouble splicing in the virtual snapdir later
 	 */
 	if (rinfo->head->is_dentry && !req->r_aborted &&
+	    req->r_locked_dir &&
 	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
 					       fsc->mount_options->snapdir_name,
 					       req->r_dentry->d_name.len))) {
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8e3fb69fbe6..1396ceb4679 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,7 +42,8 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
 	/* validate striping parameters */
 	if ((l->object_size & ~PAGE_MASK) ||
 	    (l->stripe_unit & ~PAGE_MASK) ||
-	    ((unsigned)l->object_size % (unsigned)l->stripe_unit))
+	    (l->stripe_unit != 0 &&
+	     ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
 		return -EINVAL;
 
 	/* make sure it's a valid data pool */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 7dab9c04ad5..53cf2aabce8 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -328,7 +328,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
 	}
 
 ctoUTF16_out:
-	return i;
+	return j;
 }
 
 #ifdef CONFIG_CIFS_SMB2
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 074923ce593..f0cf934ba87 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1576,9 +1576,14 @@ cifs_readv_callback(struct mid_q_entry *mid)
 		/* result already set, check signature */
 		if (server->sec_mode &
 		    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
-			if (cifs_verify_signature(rdata->iov, rdata->nr_iov,
-					  server, mid->sequence_number + 1))
-				cERROR(1, "Unexpected SMB signature");
+			int rc = 0;
+
+			rc = cifs_verify_signature(rdata->iov, rdata->nr_iov,
+						   server,
+						   mid->sequence_number + 1);
+			if (rc)
+				cERROR(1, "SMB signature verification returned "
+				       "error = %d", rc);
 		}
 		/* FIXME: should this be counted toward the initiating task? */
 		task_io_account_read(rdata->bytes);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index cbe709ad666..781025be48b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -356,19 +356,12 @@ cifs_create_get_file_info:
 cifs_create_set_dentry:
 	if (rc != 0) {
 		cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
+		CIFSSMBClose(xid, tcon, *fileHandle);
 		goto out;
 	}
 	d_drop(direntry);
 	d_add(direntry, newinode);
 
-	/* ENOENT for create?  How weird... */
-	rc = -ENOENT;
-	if (!newinode) {
-		CIFSSMBClose(xid, tcon, *fileHandle);
-		goto out;
-	}
-	rc = 0;
-
 out:
 	kfree(buf);
 	kfree(full_path);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9154192b068..71e9ad9f596 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -917,7 +917,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 	if (!buf) {
 		mutex_unlock(&cinode->lock_mutex);
 		free_xid(xid);
-		return rc;
+		return -ENOMEM;
 	}
 
 	for (i = 0; i < 2; i++) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7354877fa3b..cb79c7edecb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -124,10 +124,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 {
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	unsigned long oldtime = cifs_i->time;
 
 	cifs_revalidate_cache(inode, fattr);
 
+	spin_lock(&inode->i_lock);
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
@@ -148,9 +148,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	else
 		cifs_i->time = jiffies;
 
-	cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
-		 oldtime, cifs_i->time);
-
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
 	cifs_i->server_eof = fattr->cf_eof;
@@ -158,7 +155,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	 * Can't safely change the file size here if the client is writing to
 	 * it due to potential races.
 	 */
-	spin_lock(&inode->i_lock);
 	if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
 		i_size_write(inode, fattr->cf_eof);
 
@@ -859,12 +855,14 @@ struct inode *cifs_root_iget(struct super_block *sb)
 
 	if (rc && tcon->ipc) {
 		cFYI(1, "ipc connection - fake read inode");
+		spin_lock(&inode->i_lock);
 		inode->i_mode |= S_IFDIR;
 		set_nlink(inode, 2);
 		inode->i_op = &cifs_ipc_inode_ops;
 		inode->i_fop = &simple_dir_operations;
 		inode->i_uid = cifs_sb->mnt_uid;
 		inode->i_gid = cifs_sb->mnt_gid;
+		spin_unlock(&inode->i_lock);
 	} else if (rc) {
 		iget_failed(inode);
 		inode = ERR_PTR(rc);
@@ -1110,6 +1108,15 @@ undo_setattr:
 	goto out_close;
 }
 
+/* copied from fs/nfs/dir.c with small changes */
+static void
+cifs_drop_nlink(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	if (inode->i_nlink > 0)
+		drop_nlink(inode);
+	spin_unlock(&inode->i_lock);
+}
 
 /*
  * If dentry->d_inode is null (usually meaning the cached dentry
@@ -1166,13 +1173,13 @@ retry_std_delete:
 psx_del_no_retry:
 	if (!rc) {
 		if (inode)
-			drop_nlink(inode);
+			cifs_drop_nlink(inode);
 	} else if (rc == -ENOENT) {
 		d_drop(dentry);
 	} else if (rc == -ETXTBSY) {
 		rc = cifs_rename_pending_delete(full_path, dentry, xid);
 		if (rc == 0)
-			drop_nlink(inode);
+			cifs_drop_nlink(inode);
 	} else if ((rc == -EACCES) && (dosattr == 0) && inode) {
 		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
 		if (attrs == NULL) {
@@ -1241,9 +1248,10 @@ cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode,
 	 * setting nlink not necessary except in cases where we failed to get it
 	 * from the server or was set bogus
 	 */
+	spin_lock(&dentry->d_inode->i_lock);
 	if ((dentry->d_inode) && (dentry->d_inode->i_nlink < 2))
 		set_nlink(dentry->d_inode, 2);
-
+	spin_unlock(&dentry->d_inode->i_lock);
 	mode &= ~current_umask();
 	/* must turn on setgid bit if parent dir has it */
 	if (inode->i_mode & S_ISGID)
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 09e4b3ae456..e6ce3b11287 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -433,7 +433,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 	if (old_file->d_inode) {
 		cifsInode = CIFS_I(old_file->d_inode);
 		if (rc == 0) {
+			spin_lock(&old_file->d_inode->i_lock);
 			inc_nlink(old_file->d_inode);
+			spin_unlock(&old_file->d_inode->i_lock);
 /* BB should we make this contingent on superblock flag NOATIME? */
 /*			old_file->d_inode->i_ctime = CURRENT_TIME;*/
 			/* parent dir timestamps will update from srv
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index a4ff5d54755..e4d3b996416 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -52,7 +52,8 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
 			cERROR(1, "Bad protocol string signature header %x",
 				  *(unsigned int *) hdr->ProtocolId);
 		if (mid != hdr->MessageId)
-			cERROR(1, "Mids do not match");
+			cERROR(1, "Mids do not match: %llu and %llu", mid,
+				  hdr->MessageId);
 	}
 	cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId);
 	return 1;
@@ -107,7 +108,7 @@ smb2_check_message(char *buf, unsigned int length)
 	 * ie Validate the wct via smb2_struct_sizes table above
 	 */
 
-	if (length < 2 + sizeof(struct smb2_hdr)) {
+	if (length < sizeof(struct smb2_pdu)) {
 		if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
 			pdu->StructureSize2 = 0;
 			/*
@@ -121,15 +122,15 @@ smb2_check_message(char *buf, unsigned int length)
 		return 1;
 	}
 	if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
-		cERROR(1, "SMB length greater than maximum, mid=%lld", mid);
+		cERROR(1, "SMB length greater than maximum, mid=%llu", mid);
 		return 1;
 	}
 
 	if (check_smb2_hdr(hdr, mid))
 		return 1;
 
-	if (hdr->StructureSize != SMB2_HEADER_SIZE) {
-		cERROR(1, "Illegal structure size %d",
+	if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
+		cERROR(1, "Illegal structure size %u",
 			  le16_to_cpu(hdr->StructureSize));
 		return 1;
 	}
@@ -161,8 +162,9 @@ smb2_check_message(char *buf, unsigned int length)
 	if (4 + len != clc_len) {
 		cFYI(1, "Calculated size %u length %u mismatch mid %llu",
 			clc_len, 4 + len, mid);
-		if (clc_len == 4 + len + 1) /* BB FIXME (fix samba) */
-			return 0; /* BB workaround Samba 3 bug SessSetup rsp */
+		/* server can return one byte more */
+		if (clc_len == 4 + len + 1)
+			return 0;
 		return 1;
 	}
 	return 0;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f37a1b41b40..15dc8eea827 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -87,10 +87,6 @@
 
 #define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe)
 
-#define SMB2_HEADER_SIZE __constant_le16_to_cpu(64)
-
-#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_le16_to_cpu(9)
-
 /*
  * SMB2 Header Definition
  *
@@ -99,6 +95,9 @@
  * "PDU" :  "Protocol Data Unit" (ie a network "frame")
  *
  */
+
+#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64)
+
 struct smb2_hdr {
 	__be32 smb2_buf_length;	/* big endian on wire */
 				/* length is only two or three bytes - with
@@ -140,6 +139,9 @@ struct smb2_pdu {
  *  command code name for the struct. Note that structures must be packed.
  *
  */
+
+#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9)
+
 struct smb2_err_rsp {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 83867ef348d..d9b639b95fa 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -503,13 +503,16 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 	/* convert the length into a more usable form */
 	if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
 		struct kvec iov;
+		int rc = 0;
 
 		iov.iov_base = mid->resp_buf;
 		iov.iov_len = len;
 		/* FIXME: add code to kill session */
-		if (cifs_verify_signature(&iov, 1, server,
-					  mid->sequence_number + 1) != 0)
-			cERROR(1, "Unexpected SMB signature");
+		rc = cifs_verify_signature(&iov, 1, server,
+					   mid->sequence_number + 1);
+		if (rc)
+			cERROR(1, "SMB signature verification returned error = "
+			       "%d", rc);
 	}
 
 	/* BB special case reconnect tid and uid here? */
diff --git a/fs/dcache.c b/fs/dcache.c
index 8086636bf79..693f95bf1ca 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -389,7 +389,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	 * Inform try_to_ascend() that we are no longer attached to the
 	 * dentry tree
 	 */
-	dentry->d_flags |= DCACHE_DISCONNECTED;
+	dentry->d_flags |= DCACHE_DENTRY_KILLED;
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
@@ -1048,7 +1048,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 	 * or deletion
 	 */
 	if (new != old->d_parent ||
-		 (old->d_flags & DCACHE_DISCONNECTED) ||
+		 (old->d_flags & DCACHE_DENTRY_KILLED) ||
 		 (!locked && read_seqretry(&rename_lock, seq))) {
 		spin_unlock(&new->d_lock);
 		new = NULL;
@@ -1134,6 +1134,8 @@ positive:
 	return 1;
 
 rename_retry:
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
@@ -1141,7 +1143,7 @@ rename_retry:
 EXPORT_SYMBOL(have_submounts);
 
 /*
- * Search the dentry child list for the specified parent,
+ * Search the dentry child list of the specified parent,
  * and move any unused dentries to the end of the unused
  * list for prune_dcache(). We descend to the next level
  * whenever the d_subdirs list is non-empty and continue
@@ -1236,6 +1238,8 @@ out:
 rename_retry:
 	if (found)
 		return found;
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
@@ -3035,6 +3039,8 @@ resume:
 	return;
 
 rename_retry:
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 2340f6978d6..c5ca6ae5a30 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -526,73 +526,51 @@ struct array_data {
 	u32 elements;
 };
 
-static int u32_array_open(struct inode *inode, struct file *file)
-{
-	file->private_data = NULL;
-	return nonseekable_open(inode, file);
-}
-
-static size_t format_array(char *buf, size_t bufsize, const char *fmt,
-			   u32 *array, u32 array_size)
+static size_t u32_format_array(char *buf, size_t bufsize,
+			       u32 *array, int array_size)
 {
 	size_t ret = 0;
-	u32 i;
 
-	for (i = 0; i < array_size; i++) {
+	while (--array_size >= 0) {
 		size_t len;
+		char term = array_size ? ' ' : '\n';
 
-		len = snprintf(buf, bufsize, fmt, array[i]);
-		len++;	/* ' ' or '\n' */
+		len = snprintf(buf, bufsize, "%u%c", *array++, term);
 		ret += len;
 
-		if (buf) {
-			buf += len;
-			bufsize -= len;
-			buf[-1] = (i == array_size-1) ? '\n' : ' ';
-		}
+		buf += len;
+		bufsize -= len;
 	}
-
-	ret++;		/* \0 */
-	if (buf)
-		*buf = '\0';
-
 	return ret;
 }
 
-static char *format_array_alloc(const char *fmt, u32 *array,
-						u32 array_size)
+static int u32_array_open(struct inode *inode, struct file *file)
 {
-	size_t len = format_array(NULL, 0, fmt, array, array_size);
-	char *ret;
-
-	ret = kmalloc(len, GFP_KERNEL);
-	if (ret == NULL)
-		return NULL;
+	struct array_data *data = inode->i_private;
+	int size, elements = data->elements;
+	char *buf;
+
+	/*
+	 * Max size:
+	 *  - 10 digits + ' '/'\n' = 11 bytes per number
+	 *  - terminating NUL character
+	 */
+	size = elements*11;
+	buf = kmalloc(size+1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	buf[size] = 0;
+
+	file->private_data = buf;
+	u32_format_array(buf, size, data->array, data->elements);
 
-	format_array(ret, len, fmt, array, array_size);
-	return ret;
+	return nonseekable_open(inode, file);
 }
 
 static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
 			      loff_t *ppos)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct array_data *data = inode->i_private;
-	size_t size;
-
-	if (*ppos == 0) {
-		if (file->private_data) {
-			kfree(file->private_data);
-			file->private_data = NULL;
-		}
-
-		file->private_data = format_array_alloc("%u", data->array,
-							      data->elements);
-	}
-
-	size = 0;
-	if (file->private_data)
-		size = strlen(file->private_data);
+	size_t size = strlen(file->private_data);
 
 	return simple_read_from_buffer(buf, len, ppos,
 					file->private_data, size);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1faf4cb56f3..f86c720dba0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	unsigned long user_addr;
 	size_t bytes;
 	struct buffer_head map_bh = { 0, };
+	struct blk_plug plug;
 
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 				PAGE_SIZE - user_addr / PAGE_SIZE);
 	}
 
+	blk_start_plug(&plug);
+
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
 		sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (sdio.bio)
 		dio_bio_submit(dio, &sdio);
 
+	blk_finish_plug(&plug);
+
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 44ce5c6a541..d45ba456812 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -275,8 +275,14 @@ out:
 
 static int ecryptfs_flush(struct file *file, fl_owner_t td)
 {
-	return file->f_mode & FMODE_WRITE
-	       ? filemap_write_and_wait(file->f_mapping) : 0;
+	struct file *lower_file = ecryptfs_file_to_lower(file);
+
+	if (lower_file->f_op && lower_file->f_op->flush) {
+		filemap_write_and_wait(file->f_mapping);
+		return lower_file->f_op->flush(lower_file, td);
+	}
+
+	return 0;
 }
 
 static int ecryptfs_release(struct inode *inode, struct file *file)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 534b129ea67..cc7709e7c50 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -619,6 +619,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_old_dir_dentry;
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap = NULL;
+	struct inode *target_inode;
 
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
@@ -626,6 +627,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	dget(lower_new_dentry);
 	lower_old_dir_dentry = dget_parent(lower_old_dentry);
 	lower_new_dir_dentry = dget_parent(lower_new_dentry);
+	target_inode = new_dentry->d_inode;
 	trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
 	/* source should not be ancestor of target */
 	if (trap == lower_old_dentry) {
@@ -641,6 +643,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			lower_new_dir_dentry->d_inode, lower_new_dentry);
 	if (rc)
 		goto out_lock;
+	if (target_inode)
+		fsstack_copy_attr_all(target_inode,
+				      ecryptfs_inode_to_lower(target_inode));
 	fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
 	if (new_dir != old_dir)
 		fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 2768138eefe..9b627c15010 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -162,6 +162,7 @@ void ecryptfs_put_lower_file(struct inode *inode)
 	inode_info = ecryptfs_inode_to_private(inode);
 	if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count,
 				      &inode_info->lower_file_mutex)) {
+		filemap_write_and_wait(inode->i_mapping);
 		fput(inode_info->lower_file);
 		inode_info->lower_file = NULL;
 		mutex_unlock(&inode_info->lower_file_mutex);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1c8b5567080..eedec84c180 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1654,8 +1654,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 		error = PTR_ERR(file);
 		goto out_free_fd;
 	}
-	fd_install(fd, file);
 	ep->file = file;
+	fd_install(fd, file);
 	return fd;
 
 out_free_fd:
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 376aa77f3ca..2616d0ea5c5 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -479,7 +479,7 @@ void ext2_discard_reservation(struct inode *inode)
 /**
  * ext2_free_blocks() -- Free given blocks and update quota and i_blocks
  * @inode:		inode
- * @block:		start physcial block to free
+ * @block:		start physical block to free
  * @count:		number of blocks to free
  */
 void ext2_free_blocks (struct inode * inode, unsigned long block,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 90d901f0486..7320a66e958 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -483,7 +483,7 @@ void ext3_discard_reservation(struct inode *inode)
  * ext3_free_blocks_sb() -- Free given blocks and update quota
  * @handle:			handle to this transaction
  * @sb:				super block
- * @block:			start physcial block to free
+ * @block:			start physical block to free
  * @count:			number of blocks to free
  * @pdquot_freed_blocks:	pointer to quota
  */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a07597307fd..7e87e37a372 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3072,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle,
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	struct buffer_head *bh = iloc->bh;
 	int err = 0, rc, block;
+	int need_datasync = 0;
+	__le32 disksize;
 	uid_t i_uid;
 	gid_t i_gid;
 
@@ -3113,7 +3115,11 @@ again:
 		raw_inode->i_gid_high = 0;
 	}
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+	disksize = cpu_to_le32(ei->i_disksize);
+	if (disksize != raw_inode->i_size) {
+		need_datasync = 1;
+		raw_inode->i_size = disksize;
+	}
 	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
 	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
 	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -3129,8 +3135,11 @@ again:
 	if (!S_ISREG(inode->i_mode)) {
 		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
 	} else {
-		raw_inode->i_size_high =
-			cpu_to_le32(ei->i_disksize >> 32);
+		disksize = cpu_to_le32(ei->i_disksize >> 32);
+		if (disksize != raw_inode->i_size_high) {
+			raw_inode->i_size_high = disksize;
+			need_datasync = 1;
+		}
 		if (ei->i_disksize > 0x7fffffffULL) {
 			struct super_block *sb = inode->i_sb;
 			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -3183,6 +3192,8 @@ again:
 	ext3_clear_inode_state(inode, EXT3_STATE_NEW);
 
 	atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
+	if (need_datasync)
+		atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 out_brelse:
 	brelse (bh);
 	ext3_std_error(inode->i_sb, err);
@@ -3196,7 +3207,7 @@ out_brelse:
  *
  * - Within generic_file_write() for O_SYNC files.
  *   Here, there will be no transaction running. We wait for any running
- *   trasnaction to commit.
+ *   transaction to commit.
  *
  * - Within sys_sync(), kupdate and such.
  *   We wait on commit, if tol to.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dff171c3a12..c862ee5fe79 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3313,7 +3313,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
  * handle: The journal handle
  * inode:  The files inode
  * page:   A locked page that contains the offset "from"
- * from:   The starting byte offset (from the begining of the file)
+ * from:   The starting byte offset (from the beginning of the file)
  *         to begin discarding
  * len:    The length of bytes to discard
  * flags:  Optional flags that may be used:
@@ -3321,11 +3321,11 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
  *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
  *         Only zero the regions of the page whose buffer heads
  *         have already been unmapped.  This flag is appropriate
- *         for updateing the contents of a page whose blocks may
+ *         for updating the contents of a page whose blocks may
  *         have already been released, and we only want to zero
  *         out the regions that correspond to those released blocks.
  *
- * Returns zero on sucess or negative on failure.
+ * Returns zero on success or negative on failure.
  */
 static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
 		struct inode *inode, struct page *page, loff_t from,
@@ -3486,7 +3486,7 @@ int ext4_can_truncate(struct inode *inode)
  * @offset: The offset where the hole will begin
  * @len:    The length of the hole
  *
- * Returns: 0 on sucess or negative on failure
+ * Returns: 0 on success or negative on failure
  */
 
 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
@@ -4008,7 +4008,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
 
 	if (i_blocks <= ~0U) {
 		/*
-		 * i_blocks can be represnted in a 32 bit variable
+		 * i_blocks can be represented in a 32 bit variable
 		 * as multiple of 512 bytes
 		 */
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
@@ -4169,7 +4169,7 @@ out_brelse:
  *
  * - Within generic_file_write() for O_SYNC files.
  *   Here, there will be no transaction running. We wait for any running
- *   trasnaction to commit.
+ *   transaction to commit.
  *
  * - Within sys_sync(), kupdate and such.
  *   We wait on commit, if tol to.
@@ -4413,7 +4413,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  * worse case, the indexs blocks spread over different block groups
  *
  * If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiuguous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
  * they could still across block group boundary.
  *
  * Also account for superblock, inode, quota and xattr blocks
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8eae94771c4..08778f6cdfe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4709,7 +4709,7 @@ error_return:
  * ext4_group_add_blocks() -- Add given blocks to an existing group
  * @handle:			handle to this transaction
  * @sb:				super block
- * @block:			start physcial block to add to the block group
+ * @block:			start physical block to add to the block group
  * @count:			number of blocks to free
  *
  * This marks the blocks as free in the bitmap and buddy.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index be3efc4f64f..6d46c0d7833 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -577,10 +577,6 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
 /*
  * Write a portion of b_io inodes which belong to @sb.
  *
- * If @only_this_sb is true, then find and write all such
- * inodes. Otherwise write only ones which go sequentially
- * in reverse order.
- *
  * Return the number of pages and/or inodes written.
  */
 static long writeback_sb_inodes(struct super_block *sb,
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 03ff5b1eba9..75a20c092dd 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -117,7 +117,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
 					      const char __user *buf,
 					      size_t count, loff_t *ppos)
 {
-	unsigned val;
+	unsigned uninitialized_var(val);
 	ssize_t ret;
 
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -154,7 +154,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 						    const char __user *buf,
 						    size_t count, loff_t *ppos)
 {
-	unsigned val;
+	unsigned uninitialized_var(val);
 	ssize_t ret;
 
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3426521f320..ee8d5504229 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -396,7 +396,7 @@ err_device:
 err_region:
 	unregister_chrdev_region(devt, 1);
 err:
-	fc->conn_error = 1;
+	fuse_conn_kill(fc);
 	goto out;
 }
 
@@ -532,8 +532,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
 		cdev_del(cc->cdev);
 	}
 
-	/* kill connection and shutdown channel */
-	fuse_conn_kill(&cc->fc);
 	rc = fuse_dev_release(inode, file);	/* puts the base reference */
 
 	return rc;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 7df2b5e8fbe..f4246cfc8d8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1576,6 +1576,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 		req->pages[req->num_pages] = page;
 		req->num_pages++;
 
+		offset = 0;
 		num -= this_num;
 		total_len += this_num;
 		index++;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ce0a2838ccd..fca222dabe3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -367,11 +367,6 @@ void fuse_conn_kill(struct fuse_conn *fc)
 	wake_up_all(&fc->waitq);
 	wake_up_all(&fc->blocked_waitq);
 	wake_up_all(&fc->reserved_req_waitq);
-	mutex_lock(&fuse_mutex);
-	list_del(&fc->entry);
-	fuse_ctl_remove_conn(fc);
-	mutex_unlock(&fuse_mutex);
-	fuse_bdi_destroy(fc);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_kill);
 
@@ -380,7 +375,14 @@ static void fuse_put_super(struct super_block *sb)
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
 	fuse_send_destroy(fc);
+
 	fuse_conn_kill(fc);
+	mutex_lock(&fuse_mutex);
+	list_del(&fc->entry);
+	fuse_ctl_remove_conn(fc);
+	mutex_unlock(&fuse_mutex);
+	fuse_bdi_destroy(fc);
+
 	fuse_conn_put(fc);
 }
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index d6526347d38..01c4975da4b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -612,6 +612,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	unsigned requested = 0;
 	int alloc_required;
 	int error = 0;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
@@ -641,7 +642,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		if (error)
 			goto out_unlock;
 
-		error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks);
+		requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip, requested);
 		if (error)
 			goto out_qunlock;
 	}
@@ -654,7 +656,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	if (&ip->i_inode == sdp->sd_rindex)
 		rblocks += 2 * RES_STATFS;
 	if (alloc_required)
-		rblocks += gfs2_rg_blocks(ip);
+		rblocks += gfs2_rg_blocks(ip, requested);
 
 	error = gfs2_trans_begin(sdp, rblocks,
 				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -868,8 +870,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	brelse(dibh);
 failed:
 	gfs2_trans_end(sdp);
-	if (gfs2_mb_reserved(ip))
-		gfs2_inplace_release(ip);
+	gfs2_inplace_release(ip);
 	if (ip->i_res->rs_qa_qd_num)
 		gfs2_quota_unlock(ip);
 	if (inode == sdp->sd_rindex) {
@@ -1023,7 +1024,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 				  offset, nr_segs, gfs2_get_block_direct,
 				  NULL, NULL, 0);
 out:
-	gfs2_glock_dq_m(1, &gh);
+	gfs2_glock_dq(&gh);
 	gfs2_holder_uninit(&gh);
 	return rv;
 }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 49cd7dd4a9f..1fd3ae237bd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -786,7 +786,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		goto out_rlist;
 
 	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
-		gfs2_rs_deltree(ip->i_res);
+		gfs2_rs_deltree(ip, ip->i_res);
 
 	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
 				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index d1d791ef38d..30e21997a1a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -323,6 +323,29 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 }
 
 /**
+ * gfs2_size_hint - Give a hint to the size of a write request
+ * @file: The struct file
+ * @offset: The file offset of the write
+ * @size: The length of the write
+ *
+ * When we are about to do a write, this function records the total
+ * write size in order to provide a suitable hint to the lower layers
+ * about how many blocks will be required.
+ *
+ */
+
+static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
+{
+	struct inode *inode = filep->f_dentry->d_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
+	int hint = min_t(size_t, INT_MAX, blks);
+
+	atomic_set(&ip->i_res->rs_sizehint, hint);
+}
+
+/**
  * gfs2_allocate_page_backing - Use bmap to allocate blocks
  * @page: The (locked) page to allocate backing for
  *
@@ -382,8 +405,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		return ret;
 
-	atomic_set(&ip->i_res->rs_sizehint,
-		   PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift);
+	gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
@@ -419,7 +441,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		rblocks += data_blocks ? data_blocks : 1;
 	if (ind_blocks || data_blocks) {
 		rblocks += RES_STATFS + RES_QUOTA;
-		rblocks += gfs2_rg_blocks(ip);
+		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
 	}
 	ret = gfs2_trans_begin(sdp, rblocks, 0);
 	if (ret)
@@ -663,7 +685,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (ret)
 		return ret;
 
-	atomic_set(&ip->i_res->rs_sizehint, writesize >> sdp->sd_sb.sb_bsize_shift);
+	gfs2_size_hint(file, pos, writesize);
+
 	if (file->f_flags & O_APPEND) {
 		struct gfs2_holder gh;
 
@@ -789,7 +812,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (unlikely(error))
 		goto out_uninit;
 
-	atomic_set(&ip->i_res->rs_sizehint, len >> sdp->sd_sb.sb_bsize_shift);
+	gfs2_size_hint(file, offset, len);
 
 	while (len > 0) {
 		if (len < bytes)
@@ -822,7 +845,7 @@ retry:
 				&max_bytes, &data_blocks, &ind_blocks);
 
 		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
-			  RES_RG_HDR + gfs2_rg_blocks(ip);
+			  RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
 		if (gfs2_is_jdata(ip))
 			rblocks += data_blocks ? data_blocks : 1;
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 1ed81f40da0..e6c2fd53cab 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -186,20 +186,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 }
 
 /**
- * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
- * @gl: the glock
- *
- * If the glock is demotable, then we add it (or move it) to the end
- * of the glock LRU list.
- */
-
-static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
-{
-	if (demote_ok(gl))
-		gfs2_glock_add_to_lru(gl);
-}
-
-/**
  * gfs2_glock_put_nolock() - Decrement reference count on glock
  * @gl: The glock to put
  *
@@ -883,7 +869,14 @@ static int gfs2_glock_demote_wait(void *word)
 	return 0;
 }
 
-static void wait_on_holder(struct gfs2_holder *gh)
+/**
+ * gfs2_glock_wait - wait on a glock acquisition
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success
+ */
+
+int gfs2_glock_wait(struct gfs2_holder *gh)
 {
 	unsigned long time1 = jiffies;
 
@@ -894,12 +887,7 @@ static void wait_on_holder(struct gfs2_holder *gh)
 		gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time +
 					      GL_GLOCK_HOLD_INCR,
 					      GL_GLOCK_MAX_HOLD);
-}
-
-static void wait_on_demote(struct gfs2_glock *gl)
-{
-	might_sleep();
-	wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
+	return gh->gh_error;
 }
 
 /**
@@ -929,19 +917,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 	trace_gfs2_demote_rq(gl);
 }
 
-/**
- * gfs2_glock_wait - wait on a glock acquisition
- * @gh: the glock holder
- *
- * Returns: 0 on success
- */
-
-int gfs2_glock_wait(struct gfs2_holder *gh)
-{
-	wait_on_holder(gh);
-	return gh->gh_error;
-}
-
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 {
 	struct va_format vaf;
@@ -979,7 +954,7 @@ __acquires(&gl->gl_spin)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct list_head *insert_pt = NULL;
 	struct gfs2_holder *gh2;
-	int try_lock = 0;
+	int try_futile = 0;
 
 	BUG_ON(gh->gh_owner_pid == NULL);
 	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
@@ -987,7 +962,7 @@ __acquires(&gl->gl_spin)
 
 	if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
 		if (test_bit(GLF_LOCK, &gl->gl_flags))
-			try_lock = 1;
+			try_futile = !may_grant(gl, gh);
 		if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
 			goto fail;
 	}
@@ -996,9 +971,8 @@ __acquires(&gl->gl_spin)
 		if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
 		    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
 			goto trap_recursive;
-		if (try_lock &&
-		    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
-		    !may_grant(gl, gh)) {
+		if (try_futile &&
+		    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
 fail:
 			gh->gh_error = GLR_TRYFAILED;
 			gfs2_holder_wake(gh);
@@ -1121,8 +1095,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
 			fast_path = 1;
 	}
-	if (!test_bit(GLF_LFLUSH, &gl->gl_flags))
-		__gfs2_glock_schedule_for_reclaim(gl);
+	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+		gfs2_glock_add_to_lru(gl);
+
 	trace_gfs2_glock_queue(gh, 0);
 	spin_unlock(&gl->gl_spin);
 	if (likely(fast_path))
@@ -1141,7 +1116,8 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
 	gfs2_glock_dq(gh);
-	wait_on_demote(gl);
+	might_sleep();
+	wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
 }
 
 /**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4bdcf378418..32cc4fde975 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -94,6 +94,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	/* A shortened, inline version of gfs2_trans_begin() */
 	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
 	tr.tr_ip = (unsigned long)__builtin_return_address(0);
+	sb_start_intwrite(sdp->sd_vfs);
 	gfs2_log_reserve(sdp, tr.tr_reserved);
 	BUG_ON(current->journal_info);
 	current->journal_info = &tr;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index aaecc8085fc..3d469d37345 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -99,9 +99,26 @@ struct gfs2_rgrpd {
 #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
 	spinlock_t rd_rsspin;           /* protects reservation related vars */
 	struct rb_root rd_rstree;       /* multi-block reservation tree */
-	u32 rd_rs_cnt;                  /* count of current reservations */
 };
 
+struct gfs2_rbm {
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_bitmap *bi;	/* Bitmap must belong to the rgd */
+	u32 offset;		/* The offset is bitmap relative */
+};
+
+static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
+{
+	return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset;
+}
+
+static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
+			       const struct gfs2_rbm *rbm2)
+{
+	return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) && 
+	       (rbm1->offset == rbm2->offset);
+}
+
 enum gfs2_state_bits {
 	BH_Pinned = BH_PrivateStart,
 	BH_Escaped = BH_PrivateStart + 1,
@@ -250,18 +267,11 @@ struct gfs2_blkreserv {
 	/* components used during write (step 1): */
 	atomic_t rs_sizehint;         /* hint of the write size */
 
-	/* components used during inplace_reserve (step 2): */
-	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
-
-	/* components used during get_local_rgrp (step 3): */
-	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
 	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
 	struct rb_node rs_node;       /* link to other block reservations */
-
-	/* components used during block searches and assignments (step 4): */
-	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
-	u32 rs_biblk;                 /* start block relative to the bi */
+	struct gfs2_rbm rs_rbm;       /* Start of reservation */
 	u32 rs_free;                  /* how many blocks are still free */
+	u64 rs_inum;                  /* Inode number for reservation */
 
 	/* ancillary quota stuff */
 	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 4ce22e54730..381893ceefa 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -712,14 +712,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (error)
 		goto fail_gunlock2;
 
-	/* The newly created inode needs a reservation so it can allocate
-	   xattrs. At the same time, we want new blocks allocated to the new
-	   dinode to be as contiguous as possible. Since we allocated the
-	   dinode block under the directory's reservation, we transfer
-	   ownership of that reservation to the new inode. The directory
-	   doesn't need a reservation unless it needs a new allocation. */
-	ip->i_res = dip->i_res;
-	dip->i_res = NULL;
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		goto fail_gunlock2;
 
 	error = gfs2_acl_create(dip, inode);
 	if (error)
@@ -737,10 +732,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		brelse(bh);
 
 	gfs2_trans_end(sdp);
-	/* Check if we reserved space in the rgrp. Function link_dinode may
-	   not, depending on whether alloc is required. */
-	if (gfs2_mb_reserved(dip))
-		gfs2_inplace_release(dip);
+	gfs2_inplace_release(dip);
 	gfs2_quota_unlock(dip);
 	mark_inode_dirty(inode);
 	gfs2_glock_dq_uninit_m(2, ghs);
@@ -897,7 +889,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 gfs2_rg_blocks(dip) +
+					 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
 					 2 * RES_DINODE + RES_STATFS +
 					 RES_QUOTA, 0);
 		if (error)
@@ -1378,7 +1370,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 gfs2_rg_blocks(ndip) +
+					 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) +
 					 4 * RES_DINODE + 4 * RES_LEAF +
 					 RES_STATFS + RES_QUOTA + 4, 0);
 		if (error)
@@ -1722,7 +1714,9 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret == 0) {
-		ret = generic_setxattr(dentry, name, data, size, flags);
+		ret = gfs2_rs_alloc(ip);
+		if (ret == 0)
+			ret = generic_setxattr(dentry, name, data, size, flags);
 		gfs2_glock_dq(&gh);
 	}
 	gfs2_holder_uninit(&gh);
@@ -1757,7 +1751,9 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret == 0) {
-		ret = generic_removexattr(dentry, name);
+		ret = gfs2_rs_alloc(ip);
+		if (ret == 0)
+			ret = generic_removexattr(dentry, name);
 		gfs2_glock_dq(&gh);
 	}
 	gfs2_holder_uninit(&gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e5af9dc420e..e443966c810 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/quotaops.h>
+#include <linux/lockdep.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -766,6 +767,7 @@ fail:
 	return error;
 }
 
+static struct lock_class_key gfs2_quota_imutex_key;
 
 static int init_inodes(struct gfs2_sbd *sdp, int undo)
 {
@@ -803,6 +805,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 		fs_err(sdp, "can't get quota file inode: %d\n", error);
 		goto fail_rindex;
 	}
+	/*
+	 * i_mutex on quota files is special. Since this inode is hidden system
+	 * file, we are safe to define locking ourselves.
+	 */
+	lockdep_set_class(&sdp->sd_quota_inode->i_mutex,
+			  &gfs2_quota_imutex_key);
 
 	error = gfs2_rindex_update(sdp);
 	if (error)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a3bde91645c..4021deca61e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -765,6 +765,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	struct gfs2_holder *ghs, i_gh;
 	unsigned int qx, x;
 	struct gfs2_quota_data *qd;
+	unsigned reserved;
 	loff_t offset;
 	unsigned int nalloc = 0, blocks;
 	int error;
@@ -781,7 +782,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		return -ENOMEM;
 
 	sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
-	mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
+	mutex_lock(&ip->i_inode.i_mutex);
 	for (qx = 0; qx < num_qd; qx++) {
 		error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
 					   GL_NOCACHE, &ghs[qx]);
@@ -811,13 +812,13 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	 * two blocks need to be updated instead of 1 */
 	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
 
-	error = gfs2_inplace_reserve(ip, 1 +
-				     (nalloc * (data_blocks + ind_blocks)));
+	reserved = 1 + (nalloc * (data_blocks + ind_blocks));
+	error = gfs2_inplace_reserve(ip, reserved);
 	if (error)
 		goto out_alloc;
 
 	if (nalloc)
-		blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS;
+		blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS;
 
 	error = gfs2_trans_begin(sdp, blocks, 0);
 	if (error)
@@ -1598,7 +1599,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		error = gfs2_inplace_reserve(ip, blocks);
 		if (error)
 			goto out_i;
-		blocks += gfs2_rg_blocks(ip);
+		blocks += gfs2_rg_blocks(ip, blocks);
 	}
 
 	/* Some quotas span block boundaries and can update two blocks,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4d34887a601..3cc402ce6fe 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,9 +35,6 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
-#define RSRV_CONTENTION_FACTOR 4
-#define RGRP_RSRV_MAX_CONTENDERS 2
-
 #if BITS_PER_LONG == 32
 #define LBITMASK   (0x55555555UL)
 #define LBITSKIP55 (0x55555555UL)
@@ -67,53 +64,48 @@ static const char valid_change[16] = {
 	        1, 0, 0, 0
 };
 
-static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
-			unsigned char old_state,
-			struct gfs2_bitmap **rbi);
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
+                         const struct gfs2_inode *ip, bool nowrap);
+
 
 /**
  * gfs2_setbit - Set a bit in the bitmaps
- * @rgd: the resource group descriptor
- * @buf2: the clone buffer that holds the bitmaps
- * @bi: the bitmap structure
- * @block: the block to set
+ * @rbm: The position of the bit to set
+ * @do_clone: Also set the clone bitmap, if it exists
  * @new_state: the new state of the block
  *
  */
 
-static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2,
-			       struct gfs2_bitmap *bi, u32 block,
+static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 			       unsigned char new_state)
 {
 	unsigned char *byte1, *byte2, *end, cur_state;
-	unsigned int buflen = bi->bi_len;
-	const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+	unsigned int buflen = rbm->bi->bi_len;
+	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
 
-	byte1 = bi->bi_bh->b_data + bi->bi_offset + (block / GFS2_NBBY);
-	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
+	byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
+	end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen;
 
 	BUG_ON(byte1 >= end);
 
 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
 
 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
-		printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, "
-		       "new_state=%d\n",
-		       (unsigned long long)block, cur_state, new_state);
-		printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n",
-		       (unsigned long long)rgd->rd_addr,
-		       (unsigned long)bi->bi_start);
-		printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n",
-		       (unsigned long)bi->bi_offset,
-		       (unsigned long)bi->bi_len);
+		printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, "
+		       "new_state=%d\n", rbm->offset, cur_state, new_state);
+		printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n",
+		       (unsigned long long)rbm->rgd->rd_addr,
+		       rbm->bi->bi_start);
+		printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n",
+		       rbm->bi->bi_offset, rbm->bi->bi_len);
 		dump_stack();
-		gfs2_consist_rgrpd(rgd);
+		gfs2_consist_rgrpd(rbm->rgd);
 		return;
 	}
 	*byte1 ^= (cur_state ^ new_state) << bit;
 
-	if (buf2) {
-		byte2 = buf2 + bi->bi_offset + (block / GFS2_NBBY);
+	if (do_clone && rbm->bi->bi_clone) {
+		byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
 		*byte2 ^= (cur_state ^ new_state) << bit;
 	}
@@ -121,30 +113,21 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2,
 
 /**
  * gfs2_testbit - test a bit in the bitmaps
- * @rgd: the resource group descriptor
- * @buffer: the buffer that holds the bitmaps
- * @buflen: the length (in bytes) of the buffer
- * @block: the block to read
+ * @rbm: The bit to test
  *
+ * Returns: The two bit block state of the requested bit
  */
 
-static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
-					 const unsigned char *buffer,
-					 unsigned int buflen, u32 block)
+static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
 {
-	const unsigned char *byte, *end;
-	unsigned char cur_state;
+	const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset;
+	const u8 *byte;
 	unsigned int bit;
 
-	byte = buffer + (block / GFS2_NBBY);
-	bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
-	end = buffer + buflen;
-
-	gfs2_assert(rgd->rd_sbd, byte < end);
+	byte = buffer + (rbm->offset / GFS2_NBBY);
+	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
 
-	cur_state = (*byte >> bit) & GFS2_BIT_MASK;
-
-	return cur_state;
+	return (*byte >> bit) & GFS2_BIT_MASK;
 }
 
 /**
@@ -192,7 +175,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
  */
 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
 {
-	u64 startblk = gfs2_rs_startblk(rs);
+	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
 
 	if (blk >= startblk + rs->rs_free)
 		return 1;
@@ -202,36 +185,6 @@ static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
 }
 
 /**
- * rs_find - Find a rgrp multi-block reservation that contains a given block
- * @rgd: The rgrp
- * @rgblk: The block we're looking for, relative to the rgrp
- */
-static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
-{
-	struct rb_node **newn;
-	int rc;
-	u64 fsblk = rgblk + rgd->rd_data0;
-
-	spin_lock(&rgd->rd_rsspin);
-	newn = &rgd->rd_rstree.rb_node;
-	while (*newn) {
-		struct gfs2_blkreserv *cur =
-			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
-		rc = rs_cmp(fsblk, 1, cur);
-		if (rc < 0)
-			newn = &((*newn)->rb_left);
-		else if (rc > 0)
-			newn = &((*newn)->rb_right);
-		else {
-			spin_unlock(&rgd->rd_rsspin);
-			return cur;
-		}
-	}
-	spin_unlock(&rgd->rd_rsspin);
-	return NULL;
-}
-
-/**
  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
  *       a block in a given allocation state.
  * @buf: the buffer that holds the bitmaps
@@ -262,8 +215,6 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 	u64 mask = 0x5555555555555555ULL;
 	u32 bit;
 
-	BUG_ON(state > 3);
-
 	/* Mask off bits we don't care about at the start of the search */
 	mask <<= spoint;
 	tmp = gfs2_bit_search(ptr, mask, state);
@@ -285,6 +236,131 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 }
 
 /**
+ * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
+ * @rbm: The rbm with rgd already set correctly
+ * @block: The block number (filesystem relative)
+ *
+ * This sets the bi and offset members of an rbm based on a
+ * resource group and a filesystem relative block number. The
+ * resource group must be set in the rbm on entry, the bi and
+ * offset members will be set by this function.
+ *
+ * Returns: 0 on success, or an error code
+ */
+
+static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
+{
+	u64 rblock = block - rbm->rgd->rd_data0;
+	u32 goal = (u32)rblock;
+	int x;
+
+	if (WARN_ON_ONCE(rblock > UINT_MAX))
+		return -EINVAL;
+	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
+		return -E2BIG;
+
+	for (x = 0; x < rbm->rgd->rd_length; x++) {
+		rbm->bi = rbm->rgd->rd_bits + x;
+		if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) {
+			rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
+ * @rbm: Position to search (value/result)
+ * @n_unaligned: Number of unaligned blocks to check
+ * @len: Decremented for each block found (terminate on zero)
+ *
+ * Returns: true if a non-free block is encountered
+ */
+
+static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
+{
+	u64 block;
+	u32 n;
+	u8 res;
+
+	for (n = 0; n < n_unaligned; n++) {
+		res = gfs2_testbit(rbm);
+		if (res != GFS2_BLKST_FREE)
+			return true;
+		(*len)--;
+		if (*len == 0)
+			return true;
+		block = gfs2_rbm_to_block(rbm);
+		if (gfs2_rbm_from_block(rbm, block + 1))
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * gfs2_free_extlen - Return extent length of free blocks
+ * @rbm: Starting position
+ * @len: Max length to check
+ *
+ * Starting at the block specified by the rbm, see how many free blocks
+ * there are, not reading more than len blocks ahead. This can be done
+ * using memchr_inv when the blocks are byte aligned, but has to be done
+ * on a block by block basis in case of unaligned blocks. Also this
+ * function can cope with bitmap boundaries (although it must stop on
+ * a resource group boundary)
+ *
+ * Returns: Number of free blocks in the extent
+ */
+
+static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
+{
+	struct gfs2_rbm rbm = *rrbm;
+	u32 n_unaligned = rbm.offset & 3;
+	u32 size = len;
+	u32 bytes;
+	u32 chunk_size;
+	u8 *ptr, *start, *end;
+	u64 block;
+
+	if (n_unaligned &&
+	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
+		goto out;
+
+	n_unaligned = len & 3;
+	/* Start is now byte aligned */
+	while (len > 3) {
+		start = rbm.bi->bi_bh->b_data;
+		if (rbm.bi->bi_clone)
+			start = rbm.bi->bi_clone;
+		end = start + rbm.bi->bi_bh->b_size;
+		start += rbm.bi->bi_offset;
+		BUG_ON(rbm.offset & 3);
+		start += (rbm.offset / GFS2_NBBY);
+		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
+		ptr = memchr_inv(start, 0, bytes);
+		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
+		chunk_size *= GFS2_NBBY;
+		BUG_ON(len < chunk_size);
+		len -= chunk_size;
+		block = gfs2_rbm_to_block(&rbm);
+		gfs2_rbm_from_block(&rbm, block + chunk_size);
+		n_unaligned = 3;
+		if (ptr)
+			break;
+		n_unaligned = len & 3;
+	}
+
+	/* Deal with any bits left over at the end */
+	if (n_unaligned)
+		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
+out:
+	return size - len;
+}
+
+/**
  * gfs2_bitcount - count the number of bits in a certain state
  * @rgd: the resource group descriptor
  * @buffer: the buffer that holds the bitmaps
@@ -487,6 +563,8 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 	if (!res)
 		error = -ENOMEM;
 
+	RB_CLEAR_NODE(&res->rs_node);
+
 	down_write(&ip->i_rw_mutex);
 	if (ip->i_res)
 		kmem_cache_free(gfs2_rsrv_cachep, res);
@@ -496,11 +574,12 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 	return error;
 }
 
-static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
+static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
 {
-	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u f:%u\n",
-		       rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk,
-		       rs->rs_free);
+	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
+		       (unsigned long long)rs->rs_inum,
+		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
+		       rs->rs_rbm.offset, rs->rs_free);
 }
 
 /**
@@ -508,41 +587,26 @@ static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
  * @rs: The reservation to remove
  *
  */
-static void __rs_deltree(struct gfs2_blkreserv *rs)
+static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
 {
 	struct gfs2_rgrpd *rgd;
 
 	if (!gfs2_rs_active(rs))
 		return;
 
-	rgd = rs->rs_rgd;
-	/* We can't do this: The reason is that when the rgrp is invalidated,
-	   it's in the "middle" of acquiring the glock, but the HOLDER bit
-	   isn't set yet:
-	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
-	trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
-
-	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
-		rb_erase(&rs->rs_node, &rgd->rd_rstree);
-	BUG_ON(!rgd->rd_rs_cnt);
-	rgd->rd_rs_cnt--;
+	rgd = rs->rs_rbm.rgd;
+	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
+	rb_erase(&rs->rs_node, &rgd->rd_rstree);
+	RB_CLEAR_NODE(&rs->rs_node);
 
 	if (rs->rs_free) {
 		/* return reserved blocks to the rgrp and the ip */
-		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
-		rs->rs_rgd->rd_reserved -= rs->rs_free;
+		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
+		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
 		rs->rs_free = 0;
-		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
+		clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags);
 		smp_mb__after_clear_bit();
 	}
-	/* We can't change any of the step 1 or step 2 components of the rs.
-	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
-	   dequeued through this pointer.
-	   Can't: atomic_set(&rs->rs_sizehint, 0);
-	   Can't: rs->rs_requested = 0;
-	   Can't: rs->rs_rgd = NULL;*/
-	rs->rs_bi = NULL;
-	rs->rs_biblk = 0;
 }
 
 /**
@@ -550,17 +614,16 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
  * @rs: The reservation to remove
  *
  */
-void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
 {
 	struct gfs2_rgrpd *rgd;
 
-	if (!gfs2_rs_active(rs))
-		return;
-
-	rgd = rs->rs_rgd;
-	spin_lock(&rgd->rd_rsspin);
-	__rs_deltree(rs);
-	spin_unlock(&rgd->rd_rsspin);
+	rgd = rs->rs_rbm.rgd;
+	if (rgd) {
+		spin_lock(&rgd->rd_rsspin);
+		__rs_deltree(ip, rs);
+		spin_unlock(&rgd->rd_rsspin);
+	}
 }
 
 /**
@@ -572,8 +635,7 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
 {
 	down_write(&ip->i_rw_mutex);
 	if (ip->i_res) {
-		gfs2_rs_deltree(ip->i_res);
-		trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
+		gfs2_rs_deltree(ip, ip->i_res);
 		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
 		ip->i_res = NULL;
@@ -597,7 +659,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd)
 	spin_lock(&rgd->rd_rsspin);
 	while ((n = rb_first(&rgd->rd_rstree))) {
 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
-		__rs_deltree(rs);
+		__rs_deltree(NULL, rs);
 	}
 	spin_unlock(&rgd->rd_rsspin);
 }
@@ -1270,211 +1332,276 @@ out:
 
 /**
  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
- * @bi: the bitmap with the blocks
  * @ip: the inode structure
- * @biblk: the 32-bit block number relative to the start of the bitmap
- * @amount: the number of blocks to reserve
  *
- * Returns: NULL - reservation was already taken, so not inserted
- *          pointer to the inserted reservation
  */
-static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
-				       struct gfs2_inode *ip, u32 biblk,
-				       int amount)
+static void rs_insert(struct gfs2_inode *ip)
 {
 	struct rb_node **newn, *parent = NULL;
 	int rc;
 	struct gfs2_blkreserv *rs = ip->i_res;
-	struct gfs2_rgrpd *rgd = rs->rs_rgd;
-	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
+	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
+
+	BUG_ON(gfs2_rs_active(rs));
 
 	spin_lock(&rgd->rd_rsspin);
 	newn = &rgd->rd_rstree.rb_node;
-	BUG_ON(!ip->i_res);
-	BUG_ON(gfs2_rs_active(rs));
-	/* Figure out where to put new node */
-	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
 	while (*newn) {
 		struct gfs2_blkreserv *cur =
 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
 
 		parent = *newn;
-		rc = rs_cmp(fsblock, amount, cur);
+		rc = rs_cmp(fsblock, rs->rs_free, cur);
 		if (rc > 0)
 			newn = &((*newn)->rb_right);
 		else if (rc < 0)
 			newn = &((*newn)->rb_left);
 		else {
 			spin_unlock(&rgd->rd_rsspin);
-			return NULL; /* reservation already in use */
+			WARN_ON(1);
+			return;
 		}
 	}
 
-	/* Do our reservation work */
-	rs = ip->i_res;
-	rs->rs_free = amount;
-	rs->rs_biblk = biblk;
-	rs->rs_bi = bi;
 	rb_link_node(&rs->rs_node, parent, newn);
 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
 
-	/* Do our inode accounting for the reservation */
-	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
-
 	/* Do our rgrp accounting for the reservation */
-	rgd->rd_reserved += amount; /* blocks reserved */
-	rgd->rd_rs_cnt++; /* number of in-tree reservations */
+	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
 	spin_unlock(&rgd->rd_rsspin);
-	trace_gfs2_rs(ip, rs, TRACE_RS_INSERT);
-	return rs;
+	trace_gfs2_rs(rs, TRACE_RS_INSERT);
 }
 
 /**
- * unclaimed_blocks - return number of blocks that aren't spoken for
- */
-static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
-{
-	return rgd->rd_free_clone - rgd->rd_reserved;
-}
-
-/**
- * rg_mblk_search - find a group of multiple free blocks
+ * rg_mblk_search - find a group of multiple free blocks to form a reservation
  * @rgd: the resource group descriptor
- * @rs: the block reservation
  * @ip: pointer to the inode for which we're reserving blocks
+ * @requested: number of blocks required for this allocation
  *
- * This is very similar to rgblk_search, except we're looking for whole
- * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
- * on aligned dwords for speed's sake.
- *
- * Returns: 0 if successful or BFITNOENT if there isn't enough free space
  */
 
-static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
+			   unsigned requested)
 {
-	struct gfs2_bitmap *bi = rgd->rd_bits;
-	const u32 length = rgd->rd_length;
-	u32 blk;
-	unsigned int buf, x, search_bytes;
-	u8 *buffer = NULL;
-	u8 *ptr, *end, *nonzero;
-	u32 goal, rsv_bytes;
-	struct gfs2_blkreserv *rs;
-	u32 best_rs_bytes, unclaimed;
-	int best_rs_blocks;
+	struct gfs2_rbm rbm = { .rgd = rgd, };
+	u64 goal;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	u32 extlen;
+	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
+	int ret;
+
+	extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+	extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
+		return;
 
 	/* Find bitmap block that contains bits for goal block */
 	if (rgrp_contains_block(rgd, ip->i_goal))
-		goal = ip->i_goal - rgd->rd_data0;
+		goal = ip->i_goal;
 	else
-		goal = rgd->rd_last_alloc;
-	for (buf = 0; buf < length; buf++) {
-		bi = rgd->rd_bits + buf;
-		/* Convert scope of "goal" from rgrp-wide to within
-		   found bit block */
-		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
-			goal -= bi->bi_start * GFS2_NBBY;
-			goto do_search;
-		}
+		goal = rgd->rd_last_alloc + rgd->rd_data0;
+
+	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
+		return;
+
+	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true);
+	if (ret == 0) {
+		rs->rs_rbm = rbm;
+		rs->rs_free = extlen;
+		rs->rs_inum = ip->i_no_addr;
+		rs_insert(ip);
 	}
-	buf = 0;
-	goal = 0;
-
-do_search:
-	best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
-			       (RGRP_RSRV_MINBLKS * rgd->rd_length));
-	best_rs_bytes = (best_rs_blocks *
-			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
-		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
-	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
-	unclaimed = unclaimed_blocks(rgd);
-	if (best_rs_bytes * GFS2_NBBY > unclaimed)
-		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
-
-	for (x = 0; x <= length; x++) {
-		bi = rgd->rd_bits + buf;
+}
 
-		if (test_bit(GBF_FULL, &bi->bi_flags))
-			goto skip;
+/**
+ * gfs2_next_unreserved_block - Return next block that is not reserved
+ * @rgd: The resource group
+ * @block: The starting block
+ * @length: The required length
+ * @ip: Ignore any reservations for this inode
+ *
+ * If the block does not appear in any reservation, then return the
+ * block number unchanged. If it does appear in the reservation, then
+ * keep looking through the tree of reservations in order to find the
+ * first block number which is not reserved.
+ */
 
-		WARN_ON(!buffer_uptodate(bi->bi_bh));
-		if (bi->bi_clone)
-			buffer = bi->bi_clone + bi->bi_offset;
+static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
+				      u32 length,
+				      const struct gfs2_inode *ip)
+{
+	struct gfs2_blkreserv *rs;
+	struct rb_node *n;
+	int rc;
+
+	spin_lock(&rgd->rd_rsspin);
+	n = rgd->rd_rstree.rb_node;
+	while (n) {
+		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		rc = rs_cmp(block, length, rs);
+		if (rc < 0)
+			n = n->rb_left;
+		else if (rc > 0)
+			n = n->rb_right;
 		else
-			buffer = bi->bi_bh->b_data + bi->bi_offset;
-
-		/* We have to keep the reservations aligned on u64 boundaries
-		   otherwise we could get situations where a byte can't be
-		   used because it's after a reservation, but a free bit still
-		   is within the reservation's area. */
-		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
-		end = (buffer + bi->bi_len);
-		while (ptr < end) {
-			rsv_bytes = 0;
-			if ((ptr + best_rs_bytes) <= end)
-				search_bytes = best_rs_bytes;
-			else
-				search_bytes = end - ptr;
-			BUG_ON(!search_bytes);
-			nonzero = memchr_inv(ptr, 0, search_bytes);
-			/* If the lot is all zeroes, reserve the whole size. If
-			   there's enough zeroes to satisfy the request, use
-			   what we can. If there's not enough, keep looking. */
-			if (nonzero == NULL)
-				rsv_bytes = search_bytes;
-			else if ((nonzero - ptr) * GFS2_NBBY >=
-				 ip->i_res->rs_requested)
-				rsv_bytes = (nonzero - ptr);
-
-			if (rsv_bytes) {
-				blk = ((ptr - buffer) * GFS2_NBBY);
-				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
-				rs = rs_insert(bi, ip, blk,
-					       rsv_bytes * GFS2_NBBY);
-				if (IS_ERR(rs))
-					return PTR_ERR(rs);
-				if (rs)
-					return 0;
-			}
-			ptr += ALIGN(search_bytes, sizeof(u64));
+			break;
+	}
+
+	if (n) {
+		while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) {
+			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
+			n = n->rb_right;
+			if (n == NULL)
+				break;
+			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
 		}
-skip:
-		/* Try next bitmap block (wrap back to rgrp header
-		   if at end) */
-		buf++;
-		buf %= length;
-		goal = 0;
 	}
 
-	return BFITNOENT;
+	spin_unlock(&rgd->rd_rsspin);
+	return block;
 }
 
 /**
- * try_rgrp_fit - See if a given reservation will fit in a given RG
- * @rgd: the RG data
- * @ip: the inode
+ * gfs2_reservation_check_and_update - Check for reservations during block alloc
+ * @rbm: The current position in the resource group
+ * @ip: The inode for which we are searching for blocks
+ * @minext: The minimum extent length
  *
- * If there's room for the requested blocks to be allocated from the RG:
- * This will try to get a multi-block reservation first, and if that doesn't
- * fit, it will take what it can.
+ * This checks the current position in the rgrp to see whether there is
+ * a reservation covering this block. If not then this function is a
+ * no-op. If there is, then the position is moved to the end of the
+ * contiguous reservation(s) so that we are pointing at the first
+ * non-reserved block.
  *
- * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
+ * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
  */
 
-static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
+					     const struct gfs2_inode *ip,
+					     u32 minext)
 {
-	struct gfs2_blkreserv *rs = ip->i_res;
+	u64 block = gfs2_rbm_to_block(rbm);
+	u32 extlen = 1;
+	u64 nblock;
+	int ret;
 
-	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
+	/*
+	 * If we have a minimum extent length, then skip over any extent
+	 * which is less than the min extent length in size.
+	 */
+	if (minext) {
+		extlen = gfs2_free_extlen(rbm, minext);
+		nblock = block + extlen;
+		if (extlen < minext)
+			goto fail;
+	}
+
+	/*
+	 * Check the extent which has been found against the reservations
+	 * and skip if parts of it are already reserved
+	 */
+	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
+	if (nblock == block)
 		return 0;
-	/* Look for a multi-block reservation. */
-	if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
-	    rg_mblk_search(rgd, ip) != BFITNOENT)
-		return 1;
-	if (unclaimed_blocks(rgd) >= rs->rs_requested)
-		return 1;
+fail:
+	ret = gfs2_rbm_from_block(rbm, nblock);
+	if (ret < 0)
+		return ret;
+	return 1;
+}
 
-	return 0;
+/**
+ * gfs2_rbm_find - Look for blocks of a particular state
+ * @rbm: Value/result starting position and final position
+ * @state: The state which we want to find
+ * @minext: The requested extent length (0 for a single block)
+ * @ip: If set, check for reservations
+ * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
+ *          around until we've reached the starting point.
+ *
+ * Side effects:
+ * - If looking for free blocks, we set GBF_FULL on each bitmap which
+ *   has no free blocks in it.
+ *
+ * Returns: 0 on success, -ENOSPC if there is no block of the requested state
+ */
+
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
+			 const struct gfs2_inode *ip, bool nowrap)
+{
+	struct buffer_head *bh;
+	struct gfs2_bitmap *initial_bi;
+	u32 initial_offset;
+	u32 offset;
+	u8 *buffer;
+	int index;
+	int n = 0;
+	int iters = rbm->rgd->rd_length;
+	int ret;
+
+	/* If we are not starting at the beginning of a bitmap, then we
+	 * need to add one to the bitmap count to ensure that we search
+	 * the starting bitmap twice.
+	 */
+	if (rbm->offset != 0)
+		iters++;
+
+	while(1) {
+		if (test_bit(GBF_FULL, &rbm->bi->bi_flags) &&
+		    (state == GFS2_BLKST_FREE))
+			goto next_bitmap;
+
+		bh = rbm->bi->bi_bh;
+		buffer = bh->b_data + rbm->bi->bi_offset;
+		WARN_ON(!buffer_uptodate(bh));
+		if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone)
+			buffer = rbm->bi->bi_clone + rbm->bi->bi_offset;
+		initial_offset = rbm->offset;
+		offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state);
+		if (offset == BFITNOENT)
+			goto bitmap_full;
+		rbm->offset = offset;
+		if (ip == NULL)
+			return 0;
+
+		initial_bi = rbm->bi;
+		ret = gfs2_reservation_check_and_update(rbm, ip, minext);
+		if (ret == 0)
+			return 0;
+		if (ret > 0) {
+			n += (rbm->bi - initial_bi);
+			goto next_iter;
+		}
+		if (ret == -E2BIG) {
+			index = 0;
+			rbm->offset = 0;
+			n += (rbm->bi - initial_bi);
+			goto res_covered_end_of_rgrp;
+		}
+		return ret;
+
+bitmap_full:	/* Mark bitmap as full and fall through */
+		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
+			set_bit(GBF_FULL, &rbm->bi->bi_flags);
+
+next_bitmap:	/* Find next bitmap in the rgrp */
+		rbm->offset = 0;
+		index = rbm->bi - rbm->rgd->rd_bits;
+		index++;
+		if (index == rbm->rgd->rd_length)
+			index = 0;
+res_covered_end_of_rgrp:
+		rbm->bi = &rbm->rgd->rd_bits[index];
+		if ((index == 0) && nowrap)
+			break;
+		n++;
+next_iter:
+		if (n >= iters)
+			break;
+	}
+
+	return -ENOSPC;
 }
 
 /**
@@ -1489,34 +1616,33 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 
 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
 {
-	u32 goal = 0, block;
-	u64 no_addr;
+	u64 block;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	struct gfs2_glock *gl;
 	struct gfs2_inode *ip;
 	int error;
 	int found = 0;
-	struct gfs2_bitmap *bi;
+	struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 };
 
-	while (goal < rgd->rd_data) {
+	while (1) {
 		down_write(&sdp->sd_log_flush_lock);
-		block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, &bi);
+		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true);
 		up_write(&sdp->sd_log_flush_lock);
-		if (block == BFITNOENT)
+		if (error == -ENOSPC)
+			break;
+		if (WARN_ON_ONCE(error))
 			break;
 
-		block = gfs2_bi2rgd_blk(bi, block);
-		/* rgblk_search can return a block < goal, so we need to
-		   keep it marching forward. */
-		no_addr = block + rgd->rd_data0;
-		goal = max(block + 1, goal + 1);
-		if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
+		block = gfs2_rbm_to_block(&rbm);
+		if (gfs2_rbm_from_block(&rbm, block + 1))
+			break;
+		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
 			continue;
-		if (no_addr == skip)
+		if (block == skip)
 			continue;
-		*last_unlinked = no_addr;
+		*last_unlinked = block;
 
-		error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
+		error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl);
 		if (error)
 			continue;
 
@@ -1543,6 +1669,19 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 	return;
 }
 
+static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
+{
+	struct gfs2_rgrpd *rgd = *pos;
+
+	rgd = gfs2_rgrpd_get_next(rgd);
+	if (rgd == NULL)
+		rgd = gfs2_rgrpd_get_next(NULL);
+	*pos = rgd;
+	if (rgd != begin) /* If we didn't wrap */
+		return true;
+	return false;
+}
+
 /**
  * gfs2_inplace_reserve - Reserve space in the filesystem
  * @ip: the inode to reserve space for
@@ -1562,103 +1701,96 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 
 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
-	rs->rs_requested = requested;
-	if (gfs2_assert_warn(sdp, requested)) {
-		error = -EINVAL;
-		goto out;
-	}
+	if (gfs2_assert_warn(sdp, requested))
+		return -EINVAL;
 	if (gfs2_rs_active(rs)) {
-		begin = rs->rs_rgd;
+		begin = rs->rs_rbm.rgd;
 		flags = 0; /* Yoda: Do or do not. There is no try */
 	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
-		rs->rs_rgd = begin = ip->i_rgd;
+		rs->rs_rbm.rgd = begin = ip->i_rgd;
 	} else {
-		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
 	}
-	if (rs->rs_rgd == NULL)
+	if (rs->rs_rbm.rgd == NULL)
 		return -EBADSLT;
 
 	while (loops < 3) {
-		rg_locked = 0;
-
-		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
-			rg_locked = 1;
-			error = 0;
-		} else if (!loops && !gfs2_rs_active(rs) &&
-			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
-			/* If the rgrp already is maxed out for contenders,
-			   we can eliminate it as a "first pass" without even
-			   requesting the rgrp glock. */
-			error = GLR_TRYFAILED;
-		} else {
-			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
+		rg_locked = 1;
+
+		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
+			rg_locked = 0;
+			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
 						   LM_ST_EXCLUSIVE, flags,
 						   &rs->rs_rgd_gh);
-			if (!error && sdp->sd_args.ar_rgrplvb) {
-				error = update_rgrp_lvb(rs->rs_rgd);
-				if (error) {
+			if (error == GLR_TRYFAILED)
+				goto next_rgrp;
+			if (unlikely(error))
+				return error;
+			if (sdp->sd_args.ar_rgrplvb) {
+				error = update_rgrp_lvb(rs->rs_rbm.rgd);
+				if (unlikely(error)) {
 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 					return error;
 				}
 			}
 		}
-		switch (error) {
-		case 0:
-			if (gfs2_rs_active(rs)) {
-				if (unclaimed_blocks(rs->rs_rgd) +
-				    rs->rs_free >= rs->rs_requested) {
-					ip->i_rgd = rs->rs_rgd;
-					return 0;
-				}
-				/* We have a multi-block reservation, but the
-				   rgrp doesn't have enough free blocks to
-				   satisfy the request. Free the reservation
-				   and look for a suitable rgrp. */
-				gfs2_rs_deltree(rs);
-			}
-			if (try_rgrp_fit(rs->rs_rgd, ip)) {
-				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rs->rs_rgd);
-				ip->i_rgd = rs->rs_rgd;
-				return 0;
-			}
-			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
-				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rs->rs_rgd);
-				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
-						ip->i_no_addr);
-			}
-			if (!rg_locked)
-				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
-			/* fall through */
-		case GLR_TRYFAILED:
-			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
-			rs->rs_rgd = rs->rs_rgd ? : begin; /* if NULL, wrap */
-			if (rs->rs_rgd != begin) /* If we didn't wrap */
-				break;
 
-			flags &= ~LM_FLAG_TRY;
-			loops++;
-			/* Check that fs hasn't grown if writing to rindex */
-			if (ip == GFS2_I(sdp->sd_rindex) &&
-			    !sdp->sd_rindex_uptodate) {
-				error = gfs2_ri_update(ip);
-				if (error)
-					goto out;
-			} else if (loops == 2)
-				/* Flushing the log may release space */
-				gfs2_log_flush(sdp, NULL);
-			break;
-		default:
-			goto out;
+		/* Skip unuseable resource groups */
+		if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
+			goto skip_rgrp;
+
+		if (sdp->sd_args.ar_rgrplvb)
+			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
+
+		/* Get a reservation if we don't already have one */
+		if (!gfs2_rs_active(rs))
+			rg_mblk_search(rs->rs_rbm.rgd, ip, requested);
+
+		/* Skip rgrps when we can't get a reservation on first pass */
+		if (!gfs2_rs_active(rs) && (loops < 1))
+			goto check_rgrp;
+
+		/* If rgrp has enough free space, use it */
+		if (rs->rs_rbm.rgd->rd_free_clone >= requested) {
+			ip->i_rgd = rs->rs_rbm.rgd;
+			return 0;
+		}
+
+		/* Drop reservation, if we couldn't use reserved rgrp */
+		if (gfs2_rs_active(rs))
+			gfs2_rs_deltree(ip, rs);
+check_rgrp:
+		/* Check for unlinked inodes which can be reclaimed */
+		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
+			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
+					ip->i_no_addr);
+skip_rgrp:
+		/* Unlock rgrp if required */
+		if (!rg_locked)
+			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+next_rgrp:
+		/* Find the next rgrp, and continue looking */
+		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
+			continue;
+
+		/* If we've scanned all the rgrps, but found no free blocks
+		 * then this checks for some less likely conditions before
+		 * trying again.
+		 */
+		flags &= ~LM_FLAG_TRY;
+		loops++;
+		/* Check that fs hasn't grown if writing to rindex */
+		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
+			error = gfs2_ri_update(ip);
+			if (error)
+				return error;
 		}
+		/* Flushing the log may release space */
+		if (loops == 2)
+			gfs2_log_flush(sdp, NULL);
 	}
-	error = -ENOSPC;
 
-out:
-	if (error)
-		rs->rs_requested = 0;
-	return error;
+	return -ENOSPC;
 }
 
 /**
@@ -1672,15 +1804,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
 
-	if (!rs)
-		return;
-
-	if (!rs->rs_free)
-		gfs2_rs_deltree(rs);
-
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
-	rs->rs_requested = 0;
 }
 
 /**
@@ -1693,173 +1818,47 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 
 static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
 {
-	struct gfs2_bitmap *bi = NULL;
-	u32 length, rgrp_block, buf_block;
-	unsigned int buf;
-	unsigned char type;
-
-	length = rgd->rd_length;
-	rgrp_block = block - rgd->rd_data0;
-
-	for (buf = 0; buf < length; buf++) {
-		bi = rgd->rd_bits + buf;
-		if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
-			break;
-	}
+	struct gfs2_rbm rbm = { .rgd = rgd, };
+	int ret;
 
-	gfs2_assert(rgd->rd_sbd, buf < length);
-	buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
+	ret = gfs2_rbm_from_block(&rbm, block);
+	WARN_ON_ONCE(ret != 0);
 
-	type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
-			   bi->bi_len, buf_block);
-
-	return type;
+	return gfs2_testbit(&rbm);
 }
 
-/**
- * rgblk_search - find a block in @state
- * @rgd: the resource group descriptor
- * @goal: the goal block within the RG (start here to search for avail block)
- * @state: GFS2_BLKST_XXX the before-allocation state to find
- * @rbi: address of the pointer to the bitmap containing the block found
- *
- * Walk rgrp's bitmap to find bits that represent a block in @state.
- *
- * This function never fails, because we wouldn't call it unless we
- * know (from reservation results, etc.) that a block is available.
- *
- * Scope of @goal is just within rgrp, not the whole filesystem.
- * Scope of @returned block is just within bitmap, not the whole filesystem.
- *
- * Returns: the block number found relative to the bitmap rbi
- */
-
-static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, unsigned char state,
-			struct gfs2_bitmap **rbi)
-{
-	struct gfs2_bitmap *bi = NULL;
-	const u32 length = rgd->rd_length;
-	u32 biblk = BFITNOENT;
-	unsigned int buf, x;
-	const u8 *buffer = NULL;
-
-	*rbi = NULL;
-	/* Find bitmap block that contains bits for goal block */
-	for (buf = 0; buf < length; buf++) {
-		bi = rgd->rd_bits + buf;
-		/* Convert scope of "goal" from rgrp-wide to within found bit block */
-		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
-			goal -= bi->bi_start * GFS2_NBBY;
-			goto do_search;
-		}
-	}
-	buf = 0;
-	goal = 0;
-
-do_search:
-	/* Search (up to entire) bitmap in this rgrp for allocatable block.
-	   "x <= length", instead of "x < length", because we typically start
-	   the search in the middle of a bit block, but if we can't find an
-	   allocatable block anywhere else, we want to be able wrap around and
-	   search in the first part of our first-searched bit block.  */
-	for (x = 0; x <= length; x++) {
-		bi = rgd->rd_bits + buf;
-
-		if (test_bit(GBF_FULL, &bi->bi_flags) &&
-		    (state == GFS2_BLKST_FREE))
-			goto skip;
-
-		/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
-		   bitmaps, so we must search the originals for that. */
-		buffer = bi->bi_bh->b_data + bi->bi_offset;
-		WARN_ON(!buffer_uptodate(bi->bi_bh));
-		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
-			buffer = bi->bi_clone + bi->bi_offset;
-
-		while (1) {
-			struct gfs2_blkreserv *rs;
-			u32 rgblk;
-
-			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
-			if (biblk == BFITNOENT)
-				break;
-			/* Check if this block is reserved() */
-			rgblk = gfs2_bi2rgd_blk(bi, biblk);
-			rs = rs_find(rgd, rgblk);
-			if (rs == NULL)
-				break;
-
-			BUG_ON(rs->rs_bi != bi);
-			biblk = BFITNOENT;
-			/* This should jump to the first block after the
-			   reservation. */
-			goal = rs->rs_biblk + rs->rs_free;
-			if (goal >= bi->bi_len * GFS2_NBBY)
-				break;
-		}
-		if (biblk != BFITNOENT)
-			break;
-
-		if ((goal == 0) && (state == GFS2_BLKST_FREE))
-			set_bit(GBF_FULL, &bi->bi_flags);
-
-		/* Try next bitmap block (wrap back to rgrp header if at end) */
-skip:
-		buf++;
-		buf %= length;
-		goal = 0;
-	}
-
-	if (biblk != BFITNOENT)
-		*rbi = bi;
-
-	return biblk;
-}
 
 /**
  * gfs2_alloc_extent - allocate an extent from a given bitmap
- * @rgd: the resource group descriptor
- * @bi: the bitmap within the rgrp
- * @blk: the block within the bitmap
+ * @rbm: the resource group information
  * @dinode: TRUE if the first block we allocate is for a dinode
- * @n: The extent length
+ * @n: The extent length (value/result)
  *
- * Add the found bitmap buffer to the transaction.
+ * Add the bitmap buffer to the transaction.
  * Set the found bits to @new_state to change block's allocation state.
- * Returns: starting block number of the extent (fs scope)
  */
-static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
-			     u32 blk, bool dinode, unsigned int *n)
+static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
+			     unsigned int *n)
 {
+	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
 	const unsigned int elen = *n;
-	u32 goal, rgblk;
-	const u8 *buffer = NULL;
-	struct gfs2_blkreserv *rs;
-
-	*n = 0;
-	buffer = bi->bi_bh->b_data + bi->bi_offset;
-	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
-	gfs2_setbit(rgd, bi->bi_clone, bi, blk,
-		    dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
-	(*n)++;
-	goal = blk;
+	u64 block;
+	int ret;
+
+	*n = 1;
+	block = gfs2_rbm_to_block(rbm);
+	gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1);
+	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+	block++;
 	while (*n < elen) {
-		goal++;
-		if (goal >= (bi->bi_len * GFS2_NBBY))
-			break;
-		rgblk = gfs2_bi2rgd_blk(bi, goal);
-		rs = rs_find(rgd, rgblk);
-		if (rs) /* Oops, we bumped into someone's reservation */
-			break;
-		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
-		    GFS2_BLKST_FREE)
+		ret = gfs2_rbm_from_block(&pos, block);
+		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
 			break;
-		gfs2_setbit(rgd, bi->bi_clone, bi, goal, GFS2_BLKST_USED);
+		gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1);
+		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
 		(*n)++;
+		block++;
 	}
-	blk = gfs2_bi2rgd_blk(bi, blk);
-	rgd->rd_last_alloc = blk + *n - 1;
-	return rgd->rd_data0 + blk;
 }
 
 /**
@@ -1875,46 +1874,30 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 				     u32 blen, unsigned char new_state)
 {
-	struct gfs2_rgrpd *rgd;
-	struct gfs2_bitmap *bi = NULL;
-	u32 length, rgrp_blk, buf_blk;
-	unsigned int buf;
+	struct gfs2_rbm rbm;
 
-	rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
-	if (!rgd) {
+	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
+	if (!rbm.rgd) {
 		if (gfs2_consist(sdp))
 			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
 		return NULL;
 	}
 
-	length = rgd->rd_length;
-
-	rgrp_blk = bstart - rgd->rd_data0;
-
 	while (blen--) {
-		for (buf = 0; buf < length; buf++) {
-			bi = rgd->rd_bits + buf;
-			if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
-				break;
-		}
-
-		gfs2_assert(rgd->rd_sbd, buf < length);
-
-		buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
-		rgrp_blk++;
-
-		if (!bi->bi_clone) {
-			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
-					       GFP_NOFS | __GFP_NOFAIL);
-			memcpy(bi->bi_clone + bi->bi_offset,
-			       bi->bi_bh->b_data + bi->bi_offset,
-			       bi->bi_len);
+		gfs2_rbm_from_block(&rbm, bstart);
+		bstart++;
+		if (!rbm.bi->bi_clone) {
+			rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size,
+						   GFP_NOFS | __GFP_NOFAIL);
+			memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset,
+			       rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
+			       rbm.bi->bi_len);
 		}
-		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
-		gfs2_setbit(rgd, NULL, bi, buf_blk, new_state);
+		gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1);
+		gfs2_setbit(&rbm, false, new_state);
 	}
 
-	return rgd;
+	return rbm.rgd;
 }
 
 /**
@@ -1956,56 +1939,41 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 }
 
 /**
- * claim_reserved_blks - Claim previously reserved blocks
- * @ip: the inode that's claiming the reservation
- * @dinode: 1 if this block is a dinode block, otherwise data block
- * @nblocks: desired extent length
+ * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
+ * @ip: The inode we have just allocated blocks for
+ * @rbm: The start of the allocated blocks
+ * @len: The extent length
  *
- * Lay claim to previously allocated block reservation blocks.
- * Returns: Starting block number of the blocks claimed.
- * Sets *nblocks to the actual extent length allocated.
+ * Adjusts a reservation after an allocation has taken place. If the
+ * reservation does not match the allocation, or if it is now empty
+ * then it is removed.
  */
-static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
-			       unsigned int *nblocks)
+
+static void gfs2_adjust_reservation(struct gfs2_inode *ip,
+				    const struct gfs2_rbm *rbm, unsigned len)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
-	struct gfs2_rgrpd *rgd = rs->rs_rgd;
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_bitmap *bi;
-	u64 start_block = gfs2_rs_startblk(rs);
-	const unsigned int elen = *nblocks;
-
-	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
-	gfs2_assert_withdraw(sdp, rgd);
-	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
-	bi = rs->rs_bi;
-	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
-
-	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
-		/* Make sure the bitmap hasn't changed */
-		gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk,
-			    dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
-		rs->rs_biblk++;
-		rs->rs_free--;
-
-		BUG_ON(!rgd->rd_reserved);
-		rgd->rd_reserved--;
-		dinode = false;
-		trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM);
-	}
-
-	if (!rs->rs_free) {
-		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
+	struct gfs2_rgrpd *rgd = rbm->rgd;
+	unsigned rlen;
+	u64 block;
+	int ret;
 
-		gfs2_rs_deltree(rs);
-		/* -nblocks because we haven't returned to do the math yet.
-		   I'm doing the math backwards to prevent negative numbers,
-		   but think of it as:
-		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
-		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
-			rg_mblk_search(rgd, ip);
+	spin_lock(&rgd->rd_rsspin);
+	if (gfs2_rs_active(rs)) {
+		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
+			block = gfs2_rbm_to_block(rbm);
+			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
+			rlen = min(rs->rs_free, len);
+			rs->rs_free -= rlen;
+			rgd->rd_reserved -= rlen;
+			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
+			if (rs->rs_free && !ret)
+				goto out;
+		}
+		__rs_deltree(ip, rs);
 	}
-	return start_block;
+out:
+	spin_unlock(&rgd->rd_rsspin);
 }
 
 /**
@@ -2024,47 +1992,40 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
-	struct gfs2_rgrpd *rgd;
+	struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
 	unsigned int ndata;
-	u32 goal, blk; /* block, within the rgrp scope */
+	u64 goal;
 	u64 block; /* block, within the file system scope */
 	int error;
-	struct gfs2_bitmap *bi;
 
-	/* Only happens if there is a bug in gfs2, return something distinctive
-	 * to ensure that it is noticed.
-	 */
-	if (ip->i_res->rs_requested == 0)
-		return -ECANCELED;
-
-	/* Check if we have a multi-block reservation, and if so, claim the
-	   next free block from it. */
-	if (gfs2_rs_active(ip->i_res)) {
-		BUG_ON(!ip->i_res->rs_free);
-		rgd = ip->i_res->rs_rgd;
-		block = claim_reserved_blks(ip, dinode, nblocks);
-	} else {
-		rgd = ip->i_rgd;
+	if (gfs2_rs_active(ip->i_res))
+		goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm);
+	else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal))
+		goal = ip->i_goal;
+	else
+		goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0;
 
-		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
-			goal = ip->i_goal - rgd->rd_data0;
-		else
-			goal = rgd->rd_last_alloc;
-
-		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
-
-		/* Since all blocks are reserved in advance, this shouldn't
-		   happen */
-		if (blk == BFITNOENT) {
-			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
-			       *nblocks);
-			printk(KERN_WARNING "FULL=%d\n",
-			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
-			goto rgrp_error;
-		}
+	gfs2_rbm_from_block(&rbm, goal);
+	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false);
 
-		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+	if (error == -ENOSPC) {
+		gfs2_rbm_from_block(&rbm, goal);
+		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false);
+	}
+
+	/* Since all blocks are reserved in advance, this shouldn't happen */
+	if (error) {
+		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n",
+			(unsigned long long)ip->i_no_addr, error, *nblocks,
+			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags));
+		goto rgrp_error;
 	}
+
+	gfs2_alloc_extent(&rbm, dinode, nblocks);
+	block = gfs2_rbm_to_block(&rbm);
+	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
+	if (gfs2_rs_active(ip->i_res))
+		gfs2_adjust_reservation(ip, &rbm, *nblocks);
 	ndata = *nblocks;
 	if (dinode)
 		ndata--;
@@ -2081,22 +2042,22 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 			brelse(dibh);
 		}
 	}
-	if (rgd->rd_free < *nblocks) {
+	if (rbm.rgd->rd_free < *nblocks) {
 		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
 		goto rgrp_error;
 	}
 
-	rgd->rd_free -= *nblocks;
+	rbm.rgd->rd_free -= *nblocks;
 	if (dinode) {
-		rgd->rd_dinodes++;
-		*generation = rgd->rd_igeneration++;
+		rbm.rgd->rd_dinodes++;
+		*generation = rbm.rgd->rd_igeneration++;
 		if (*generation == 0)
-			*generation = rgd->rd_igeneration++;
+			*generation = rbm.rgd->rd_igeneration++;
 	}
 
-	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
 	if (dinode)
@@ -2110,14 +2071,14 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 		gfs2_quota_change(ip, ndata, ip->i_inode.i_uid,
 				  ip->i_inode.i_gid);
 
-	rgd->rd_free_clone -= *nblocks;
-	trace_gfs2_block_alloc(ip, rgd, block, *nblocks,
+	rbm.rgd->rd_free_clone -= *nblocks;
+	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
 	*bn = block;
 	return 0;
 
 rgrp_error:
-	gfs2_rgrp_error(rgd);
+	gfs2_rgrp_error(rbm.rgd);
 	return -EIO;
 }
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index ca6e26729b8..24077958dcf 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -46,7 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
-extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
+extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs);
 extern void gfs2_rs_delete(struct gfs2_inode *ip);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -73,30 +73,10 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
 extern int gfs2_fitrim(struct file *filp, void __user *argp);
 
-/* This is how to tell if a multi-block reservation is "inplace" reserved: */
-static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
+/* This is how to tell if a reservation is in the rgrp tree: */
+static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
 {
-	if (ip->i_res && ip->i_res->rs_requested)
-		return 1;
-	return 0;
-}
-
-/* This is how to tell if a multi-block reservation is in the rgrp tree: */
-static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
-{
-	if (rs && rs->rs_bi)
-		return 1;
-	return 0;
-}
-
-static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
-{
-	return (bi->bi_start * GFS2_NBBY) + blk;
-}
-
-static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
-{
-	return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
+	return rs && !RB_EMPTY_NODE(&rs->rs_node);
 }
 
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index fc3168f47a1..a8d90f2f576 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1366,6 +1366,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 	val = sdp->sd_tune.gt_statfs_quantum;
 	if (val != 30)
 		seq_printf(s, ",statfs_quantum=%d", val);
+	else if (sdp->sd_tune.gt_statfs_slow)
+		seq_puts(s, ",statfs_quantum=0");
 	val = sdp->sd_tune.gt_quota_quantum;
 	if (val != 60)
 		seq_printf(s, ",quota_quantum=%d", val);
@@ -1543,6 +1545,11 @@ static void gfs2_evict_inode(struct inode *inode)
 
 out_truncate:
 	gfs2_log_flush(sdp, ip->i_gl);
+	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
+		struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
+		filemap_fdatawrite(metamapping);
+		filemap_fdatawait(metamapping);
+	}
 	write_inode_now(inode, 1);
 	gfs2_ail_flush(ip->i_gl, 0);
 
@@ -1557,7 +1564,7 @@ out_truncate:
 out_unlock:
 	/* Error path for case 1 */
 	if (gfs2_rs_active(ip->i_res))
-		gfs2_rs_deltree(ip->i_res);
+		gfs2_rs_deltree(ip, ip->i_res);
 
 	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
 		gfs2_glock_dq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index a25c252fe41..bbdc78af60c 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -509,10 +509,9 @@ TRACE_EVENT(gfs2_block_alloc,
 /* Keep track of multi-block reservations as they are allocated/freed */
 TRACE_EVENT(gfs2_rs,
 
-	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
-		 u8 func),
+	TP_PROTO(const struct gfs2_blkreserv *rs, u8 func),
 
-	TP_ARGS(ip, rs, func),
+	TP_ARGS(rs, func),
 
 	TP_STRUCT__entry(
 		__field(        dev_t,  dev                     )
@@ -526,18 +525,17 @@ TRACE_EVENT(gfs2_rs,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
-		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
-		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
-		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
-		__entry->inum		= ip ? ip->i_no_addr : 0;
-		__entry->start		= gfs2_rs_startblk(rs);
+		__entry->dev		= rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev;
+		__entry->rd_addr	= rs->rs_rbm.rgd->rd_addr;
+		__entry->rd_free_clone	= rs->rs_rbm.rgd->rd_free_clone;
+		__entry->rd_reserved	= rs->rs_rbm.rgd->rd_reserved;
+		__entry->inum		= rs->rs_inum;
+		__entry->start		= gfs2_rbm_to_block(&rs->rs_rbm);
 		__entry->free		= rs->rs_free;
 		__entry->func		= func;
 	),
 
-	TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s "
-		  "f:%lu",
+	TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long long)__entry->inum,
 		  (unsigned long long)__entry->start,
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 41f42cdccbb..bf2ae9aeee7 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -28,11 +28,10 @@ struct gfs2_glock;
 
 /* reserve either the number of blocks to be allocated plus the rg header
  * block, or all of the blocks in the rg, whichever is smaller */
-static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip)
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested)
 {
-	const struct gfs2_blkreserv *rs = ip->i_res;
-	if (rs && rs->rs_requested < ip->i_rgd->rd_length)
-		return rs->rs_requested + 1;
+	if (requested < ip->i_rgd->rd_length)
+		return requested + 1;
 	return ip->i_rgd->rd_length;
 }
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 27a0b4a901f..db330e5518c 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -448,17 +448,18 @@ ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 }
 
 /**
- * ea_get_unstuffed - actually copies the unstuffed data into the
- *                    request buffer
+ * ea_iter_unstuffed - copies the unstuffed xattr data to/from the
+ *                     request buffer
  * @ip: The GFS2 inode
  * @ea: The extended attribute header structure
- * @data: The data to be copied
+ * @din: The data to be copied in
+ * @dout: The data to be copied out (one of din,dout will be NULL)
  *
  * Returns: errno
  */
 
-static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
-			    char *data)
+static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+			       const char *din, char *dout)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head **bh;
@@ -467,6 +468,8 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
 	unsigned int x;
 	int error = 0;
+	unsigned char *pos;
+	unsigned cp_size;
 
 	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
 	if (!bh)
@@ -497,12 +500,21 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			goto out;
 		}
 
-		memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
-		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+		pos = bh[x]->b_data + sizeof(struct gfs2_meta_header);
+		cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize;
 
-		amount -= sdp->sd_jbsize;
-		data += sdp->sd_jbsize;
+		if (dout) {
+			memcpy(dout, pos, cp_size);
+			dout += sdp->sd_jbsize;
+		}
+
+		if (din) {
+			gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+			memcpy(pos, din, cp_size);
+			din += sdp->sd_jbsize;
+		}
 
+		amount -= sdp->sd_jbsize;
 		brelse(bh[x]);
 	}
 
@@ -523,7 +535,7 @@ static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
 		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
 		return len;
 	}
-	ret = ea_get_unstuffed(ip, el->el_ea, data);
+	ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data);
 	if (ret < 0)
 		return ret;
 	return len;
@@ -727,7 +739,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + gfs2_rg_blocks(ip) +
+				 blks + gfs2_rg_blocks(ip, blks) +
 				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipres;
@@ -1220,69 +1232,23 @@ static int gfs2_xattr_set(struct dentry *dentry, const char *name,
 				size, flags, type);
 }
 
+
 static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
 				  struct gfs2_ea_header *ea, char *data)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head **bh;
 	unsigned int amount = GFS2_EA_DATA_LEN(ea);
 	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
-	unsigned int x;
-	int error;
-
-	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!bh)
-		return -ENOMEM;
-
-	error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
-	if (error)
-		goto out;
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
-				       bh + x);
-		if (error) {
-			while (x--)
-				brelse(bh[x]);
-			goto fail;
-		}
-		dataptrs++;
-	}
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_wait(sdp, bh[x]);
-		if (error) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			goto fail;
-		}
-		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			error = -EIO;
-			goto fail;
-		}
-
-		gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
-
-		memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
-		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
-
-		amount -= sdp->sd_jbsize;
-		data += sdp->sd_jbsize;
-
-		brelse(bh[x]);
-	}
+	int ret;
 
-out:
-	kfree(bh);
-	return error;
+	ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+	if (ret)
+		return ret;
 
-fail:
+	ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
 	gfs2_trans_end(sdp);
-	kfree(bh);
-	return error;
+
+	return ret;
 }
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 09357508ec9..a2862339323 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1113,6 +1113,11 @@ static void mark_journal_empty(journal_t *journal)
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 	spin_lock(&journal->j_state_lock);
+	/* Is it already empty? */
+	if (sb->s_start == 0) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
 	jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
         	  journal->j_tail_sequence);
 
diff --git a/fs/libfs.c b/fs/libfs.c
index a74cb1725ac..7cc37ca19cd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -874,7 +874,7 @@ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
 EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
 
 /**
- * generic_fh_to_dentry - generic helper for the fh_to_parent export operation
+ * generic_fh_to_parent - generic helper for the fh_to_parent export operation
  * @sb:		filesystem to do the file handle conversion on
  * @fid:	file handle to convert
  * @fh_len:	length of the file handle in bytes
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index fb1a2bedbe9..8d80c990dff 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -289,7 +289,6 @@ static void nlmsvc_free_block(struct kref *kref)
 	dprintk("lockd: freeing block %p...\n", block);
 
 	/* Remove block from file's list of blocks */
-	mutex_lock(&file->f_mutex);
 	list_del_init(&block->b_flist);
 	mutex_unlock(&file->f_mutex);
 
@@ -303,7 +302,7 @@ static void nlmsvc_free_block(struct kref *kref)
 static void nlmsvc_release_block(struct nlm_block *block)
 {
 	if (block != NULL)
-		kref_put(&block->b_count, nlmsvc_free_block);
+		kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex);
 }
 
 /*
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index df0de27c273..e784a217b50 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -26,6 +26,7 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
 	struct completion complete;
 
 	bio_init(&bio);
+	bio.bi_max_vecs = 1;
 	bio.bi_io_vec = &bio_vec;
 	bio_vec.bv_page = page;
 	bio_vec.bv_len = PAGE_SIZE;
@@ -95,12 +96,11 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 	struct address_space *mapping = super->s_mapping_inode->i_mapping;
 	struct bio *bio;
 	struct page *page;
-	struct request_queue *q = bdev_get_queue(sb->s_bdev);
-	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
+	unsigned int max_pages;
 	int i;
 
-	if (max_pages > BIO_MAX_PAGES)
-		max_pages = BIO_MAX_PAGES;
+	max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
 
@@ -190,12 +190,11 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct bio *bio;
-	struct request_queue *q = bdev_get_queue(sb->s_bdev);
-	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
+	unsigned int max_pages;
 	int i;
 
-	if (max_pages > BIO_MAX_PAGES)
-		max_pages = BIO_MAX_PAGES;
+	max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+
 	bio = bio_alloc(GFP_NOFS, max_pages);
 	BUG_ON(!bio);
 
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index a422f42238b..6984562738d 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -156,10 +156,26 @@ static void __logfs_destroy_inode(struct inode *inode)
 	call_rcu(&inode->i_rcu, logfs_i_callback);
 }
 
+static void __logfs_destroy_meta_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	BUG_ON(li->li_block);
+	call_rcu(&inode->i_rcu, logfs_i_callback);
+}
+
 static void logfs_destroy_inode(struct inode *inode)
 {
 	struct logfs_inode *li = logfs_inode(inode);
 
+	if (inode->i_ino < LOGFS_RESERVED_INOS) {
+		/*
+		 * The reserved inodes are never destroyed unless we are in
+		 * unmont path.
+		 */
+		__logfs_destroy_meta_inode(inode);
+		return;
+	}
+
 	BUG_ON(list_empty(&li->li_freeing_list));
 	spin_lock(&logfs_inode_lock);
 	li->li_refcount--;
@@ -373,8 +389,8 @@ static void logfs_put_super(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
 	/* kill the meta-inodes */
-	iput(super->s_master_inode);
 	iput(super->s_segfile_inode);
+	iput(super->s_master_inode);
 	iput(super->s_mapping_inode);
 }
 
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 1e1c369df22..2a09b8d7398 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -565,7 +565,7 @@ static void write_wbuf(struct super_block *sb, struct logfs_area *area,
 	index = ofs >> PAGE_SHIFT;
 	page_ofs = ofs & (PAGE_SIZE - 1);
 
-	page = find_lock_page(mapping, index);
+	page = find_or_create_page(mapping, index, GFP_NOFS);
 	BUG_ON(!page);
 	memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
 	unlock_page(page);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index f1cb512c501..5be0abef603 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2189,7 +2189,6 @@ void logfs_evict_inode(struct inode *inode)
 		return;
 	}
 
-	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
 	page = inode_to_page(inode);
 	BUG_ON(!page); /* FIXME: Use emergency page */
 	logfs_put_write_page(page);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index e28d090c98d..038da099179 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -886,7 +886,7 @@ static struct logfs_area *alloc_area(struct super_block *sb)
 
 static void map_invalidatepage(struct page *page, unsigned long l)
 {
-	BUG();
+	return;
 }
 
 static int map_releasepage(struct page *page, gfp_t g)
diff --git a/fs/namei.c b/fs/namei.c
index db76b866a09..dd1ed1b8e98 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -352,6 +352,7 @@ int __inode_permission(struct inode *inode, int mask)
 /**
  * sb_permission - Check superblock-level permissions
  * @sb: Superblock of inode to check permission on
+ * @inode: Inode to check permission on
  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  *
  * Separate out file-system wide checks from inode-specific permission checks.
@@ -656,6 +657,7 @@ int sysctl_protected_hardlinks __read_mostly = 1;
 /**
  * may_follow_link - Check symlink following for unsafe situations
  * @link: The path of the symlink
+ * @nd: nameidata pathwalk data
  *
  * In the case of the sysctl_protected_symlinks sysctl being enabled,
  * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
diff --git a/fs/namespace.c b/fs/namespace.c
index 4d31f73e256..7bdf7907413 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1886,8 +1886,14 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 		return err;
 
 	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt)))
-		goto unlock;
+	if (unlikely(!check_mnt(real_mount(path->mnt)))) {
+		/* that's acceptable only for automounts done in private ns */
+		if (!(mnt_flags & MNT_SHRINKABLE))
+			goto unlock;
+		/* ... and for those we'd better have mountpoint still alive */
+		if (!real_mount(path->mnt)->mnt_ns)
+			goto unlock;
+	}
 
 	/* Refuse the same filesystem on the same mount point */
 	err = -EBUSY;
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8bf3a3f6925..b7db60897f9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -12,19 +12,19 @@ nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_SYSCTL)	+= sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
 
-obj-$(CONFIG_NFS_V2) += nfs2.o
-nfs2-y := nfs2super.o proc.o nfs2xdr.o
+obj-$(CONFIG_NFS_V2) += nfsv2.o
+nfsv2-y := nfs2super.o proc.o nfs2xdr.o
 
-obj-$(CONFIG_NFS_V3) += nfs3.o
-nfs3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
-nfs3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
+obj-$(CONFIG_NFS_V3) += nfsv3.o
+nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
+nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
 
-obj-$(CONFIG_NFS_V4) += nfs4.o
-nfs4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
+obj-$(CONFIG_NFS_V4) += nfsv4.o
+nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
 	  delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
 	  nfs4namespace.o nfs4getroot.o nfs4client.o
-nfs4-$(CONFIG_SYSCTL)	+= nfs4sysctl.o
-nfs4-$(CONFIG_NFS_V4_1)	+= pnfs.o pnfs_dev.o
+nfsv4-$(CONFIG_SYSCTL)	+= nfs4sysctl.o
+nfsv4-$(CONFIG_NFS_V4_1)	+= pnfs.o pnfs_dev.o
 
 obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9fc0d9dfc91..99694442b93 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
 
 	if (IS_ERR(nfs)) {
 		mutex_lock(&nfs_version_mutex);
-		request_module("nfs%d", version);
+		request_module("nfsv%d", version);
 		nfs = find_nfs_version(version);
 		mutex_unlock(&nfs_version_mutex);
 	}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 75d6d0a3d32..6a7fcab7ecb 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -287,10 +287,12 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret != 0)
+		goto out;
 	mutex_lock(&inode->i_mutex);
 	ret = nfs_file_fsync_commit(file, start, end, datasync);
 	mutex_unlock(&inode->i_mutex);
-
+out:
 	return ret;
 }
 
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index b701358c39c..a850079467d 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -61,6 +61,12 @@ struct idmap {
 	struct mutex		idmap_mutex;
 };
 
+struct idmap_legacy_upcalldata {
+	struct rpc_pipe_msg pipe_msg;
+	struct idmap_msg idmap_msg;
+	struct idmap *idmap;
+};
+
 /**
  * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
  * @fattr: fully initialised struct nfs_fattr
@@ -324,6 +330,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
 		ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
 					    name, namelen, type, data,
 					    data_size, idmap);
+		idmap->idmap_key_cons = NULL;
 		mutex_unlock(&idmap->idmap_mutex);
 	}
 	return ret;
@@ -380,11 +387,13 @@ static const match_table_t nfs_idmap_tokens = {
 static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
 static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
 				   size_t);
+static void idmap_release_pipe(struct inode *);
 static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 
 static const struct rpc_pipe_ops idmap_upcall_ops = {
 	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= idmap_pipe_downcall,
+	.release_pipe	= idmap_release_pipe,
 	.destroy_msg	= idmap_pipe_destroy_msg,
 };
 
@@ -616,7 +625,8 @@ void nfs_idmap_quit(void)
 	nfs_idmap_quit_keyring();
 }
 
-static int nfs_idmap_prepare_message(char *desc, struct idmap_msg *im,
+static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
+				     struct idmap_msg *im,
 				     struct rpc_pipe_msg *msg)
 {
 	substring_t substr;
@@ -659,6 +669,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
 				   const char *op,
 				   void *aux)
 {
+	struct idmap_legacy_upcalldata *data;
 	struct rpc_pipe_msg *msg;
 	struct idmap_msg *im;
 	struct idmap *idmap = (struct idmap *)aux;
@@ -666,15 +677,15 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
 	int ret = -ENOMEM;
 
 	/* msg and im are freed in idmap_pipe_destroy_msg */
-	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
-	if (!msg)
-		goto out0;
-
-	im = kmalloc(sizeof(*im), GFP_KERNEL);
-	if (!im)
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
 		goto out1;
 
-	ret = nfs_idmap_prepare_message(key->description, im, msg);
+	msg = &data->pipe_msg;
+	im = &data->idmap_msg;
+	data->idmap = idmap;
+
+	ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
 	if (ret < 0)
 		goto out2;
 
@@ -683,15 +694,15 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
 
 	ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
 	if (ret < 0)
-		goto out2;
+		goto out3;
 
 	return ret;
 
+out3:
+	idmap->idmap_key_cons = NULL;
 out2:
-	kfree(im);
+	kfree(data);
 out1:
-	kfree(msg);
-out0:
 	complete_request_key(cons, ret);
 	return ret;
 }
@@ -749,9 +760,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	}
 
 	if (!(im.im_status & IDMAP_STATUS_SUCCESS)) {
-		ret = mlen;
-		complete_request_key(cons, -ENOKEY);
-		goto out_incomplete;
+		ret = -ENOKEY;
+		goto out;
 	}
 
 	namelen_in = strnlen(im.im_name, IDMAP_NAMESZ);
@@ -768,16 +778,32 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 
 out:
 	complete_request_key(cons, ret);
-out_incomplete:
 	return ret;
 }
 
 static void
 idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
 {
+	struct idmap_legacy_upcalldata *data = container_of(msg,
+			struct idmap_legacy_upcalldata,
+			pipe_msg);
+	struct idmap *idmap = data->idmap;
+	struct key_construction *cons;
+	if (msg->errno) {
+		cons = ACCESS_ONCE(idmap->idmap_key_cons);
+		idmap->idmap_key_cons = NULL;
+		complete_request_key(cons, msg->errno);
+	}
 	/* Free memory allocated in nfs_idmap_legacy_upcall() */
-	kfree(msg->data);
-	kfree(msg);
+	kfree(data);
+}
+
+static void
+idmap_release_pipe(struct inode *inode)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	struct idmap *idmap = (struct idmap *)rpci->private;
+	idmap->idmap_key_cons = NULL;
 }
 
 int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c6e895f0fbf..9b47610338f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -154,7 +154,7 @@ static void nfs_zap_caches_locked(struct inode *inode)
 	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 	nfsi->attrtimeo_timestamp = jiffies;
 
-	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
+	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 	else
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0952c791df3..69322096c32 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -69,7 +69,7 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
 	nfs_fattr_init(info->fattr);
 	status = rpc_call_sync(client, &msg, 0);
 	dprintk("%s: reply fsinfo: %d\n", __func__, status);
-	if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
+	if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) {
 		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
 		msg.rpc_resp = info->fattr;
 		status = rpc_call_sync(client, &msg, 0);
@@ -643,7 +643,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		  u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
-	__be32			*verf = NFS_COOKIEVERF(dir);
+	__be32			*verf = NFS_I(dir)->cookieverf;
 	struct nfs3_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
 		.cookie		= cookie,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3b950dd81e8..da0618aeead 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -205,6 +205,9 @@ extern const struct dentry_operations nfs4_dentry_operations;
 int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
 		    unsigned, umode_t, int *);
 
+/* super.c */
+extern struct file_system_type nfs4_fs_type;
+
 /* nfs4namespace.c */
 rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index cbcdfaf3250..24eb663f8ed 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -74,7 +74,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 	return clp;
 
 error:
-	kfree(clp);
+	nfs_free_client(clp);
 	return ERR_PTR(err);
 }
 
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index acb65e7887f..eb5eb8eef4d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -96,13 +96,15 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret != 0)
+		goto out;
 	mutex_lock(&inode->i_mutex);
 	ret = nfs_file_fsync_commit(file, start, end, datasync);
 	if (!ret && !datasync)
 		/* application has asked for meta-data sync */
 		ret = pnfs_layoutcommit_inode(inode, true);
 	mutex_unlock(&inode->i_mutex);
-
+out:
 	return ret;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a99a8d94872..1e50326d00d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3215,11 +3215,11 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name,
 			(unsigned long long)cookie);
-	nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
+	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);
 	res.pgbase = args.pgbase;
 	status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
 	if (status >= 0) {
-		memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+		memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE);
 		status += args.pgbase;
 	}
 
@@ -3653,11 +3653,11 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
 		&& (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
 }
 
-/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that
- * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on
+/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
+ * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on
  * the stack.
  */
-#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
+#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
 
 static int buf_to_pages_noslab(const void *buf, size_t buflen,
 		struct page **pages, unsigned int *pgbase)
@@ -3668,7 +3668,7 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen,
 	spages = pages;
 
 	do {
-		len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
+		len = min_t(size_t, PAGE_SIZE, buflen);
 		newpage = alloc_page(GFP_KERNEL);
 
 		if (newpage == NULL)
@@ -3737,9 +3737,10 @@ out:
 static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len)
 {
 	struct nfs4_cached_acl *acl;
+	size_t buflen = sizeof(*acl) + acl_len;
 
-	if (pages && acl_len <= PAGE_SIZE) {
-		acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL);
+	if (buflen <= PAGE_SIZE) {
+		acl = kmalloc(buflen, GFP_KERNEL);
 		if (acl == NULL)
 			goto out;
 		acl->cached = 1;
@@ -3781,17 +3782,15 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	int ret = -ENOMEM, npages, i;
-	size_t acl_len = 0;
+	unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
+	int ret = -ENOMEM, i;
 
-	npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	/* As long as we're doing a round trip to the server anyway,
 	 * let's be prepared for a page of acl data. */
 	if (npages == 0)
 		npages = 1;
-
-	/* Add an extra page to handle the bitmap returned */
-	npages++;
+	if (npages > ARRAY_SIZE(pages))
+		return -ERANGE;
 
 	for (i = 0; i < npages; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
@@ -3807,11 +3806,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 	args.acl_len = npages * PAGE_SIZE;
 	args.acl_pgbase = 0;
 
-	/* Let decode_getfacl know not to fail if the ACL data is larger than
-	 * the page we send as a guess */
-	if (buf == NULL)
-		res.acl_flags |= NFS4_ACL_LEN_REQUEST;
-
 	dprintk("%s  buf %p buflen %zu npages %d args.acl_len %zu\n",
 		__func__, buf, buflen, npages, args.acl_len);
 	ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
@@ -3819,20 +3813,19 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 	if (ret)
 		goto out_free;
 
-	acl_len = res.acl_len - res.acl_data_offset;
-	if (acl_len > args.acl_len)
-		nfs4_write_cached_acl(inode, NULL, 0, acl_len);
-	else
-		nfs4_write_cached_acl(inode, pages, res.acl_data_offset,
-				      acl_len);
-	if (buf) {
+	/* Handle the case where the passed-in buffer is too short */
+	if (res.acl_flags & NFS4_ACL_TRUNC) {
+		/* Did the user only issue a request for the acl length? */
+		if (buf == NULL)
+			goto out_ok;
 		ret = -ERANGE;
-		if (acl_len > buflen)
-			goto out_free;
-		_copy_from_pages(buf, pages, res.acl_data_offset,
-				acl_len);
+		goto out_free;
 	}
-	ret = acl_len;
+	nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
+	if (buf)
+		_copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
+out_ok:
+	ret = res.acl_len;
 out_free:
 	for (i = 0; i < npages; i++)
 		if (pages[i])
@@ -3890,10 +3883,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
+	unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
 	int ret, i;
 
 	if (!nfs4_server_supports_acls(server))
 		return -EOPNOTSUPP;
+	if (npages > ARRAY_SIZE(pages))
+		return -ERANGE;
 	i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	if (i < 0)
 		return i;
@@ -6223,11 +6219,58 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	dprintk("<-- %s\n", __func__);
 }
 
+static size_t max_response_pages(struct nfs_server *server)
+{
+	u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	return nfs_page_array_len(0, max_resp_sz);
+}
+
+static void nfs4_free_pages(struct page **pages, size_t size)
+{
+	int i;
+
+	if (!pages)
+		return;
+
+	for (i = 0; i < size; i++) {
+		if (!pages[i])
+			break;
+		__free_page(pages[i]);
+	}
+	kfree(pages);
+}
+
+static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
+{
+	struct page **pages;
+	int i;
+
+	pages = kcalloc(size, sizeof(struct page *), gfp_flags);
+	if (!pages) {
+		dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
+		return NULL;
+	}
+
+	for (i = 0; i < size; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i]) {
+			dprintk("%s: failed to allocate page\n", __func__);
+			nfs4_free_pages(pages, size);
+			return NULL;
+		}
+	}
+
+	return pages;
+}
+
 static void nfs4_layoutget_release(void *calldata)
 {
 	struct nfs4_layoutget *lgp = calldata;
+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	size_t max_pages = max_response_pages(server);
 
 	dprintk("--> %s\n", __func__);
+	nfs4_free_pages(lgp->args.layout.pages, max_pages);
 	put_nfs_open_context(lgp->args.ctx);
 	kfree(calldata);
 	dprintk("<-- %s\n", __func__);
@@ -6239,9 +6282,10 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
 	.rpc_release = nfs4_layoutget_release,
 };
 
-int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 {
 	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	size_t max_pages = max_response_pages(server);
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
@@ -6259,12 +6303,19 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
 
 	dprintk("--> %s\n", __func__);
 
+	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
+	if (!lgp->args.layout.pages) {
+		nfs4_layoutget_release(lgp);
+		return;
+	}
+	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
+
 	lgp->res.layoutp = &lgp->args.layout;
 	lgp->res.seq_res.sr_slot = NULL;
 	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
-		return PTR_ERR(task);
+		return;
 	status = nfs4_wait_for_completion_rpc_task(task);
 	if (status == 0)
 		status = task->tk_status;
@@ -6272,7 +6323,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
 		status = pnfs_layout_process(lgp);
 	rpc_put_task(task);
 	dprintk("<-- %s status=%d\n", __func__, status);
-	return status;
+	return;
 }
 
 static void
@@ -6304,12 +6355,8 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 		return;
 	}
 	spin_lock(&lo->plh_inode->i_lock);
-	if (task->tk_status == 0) {
-		if (lrp->res.lrs_present) {
-			pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-		} else
-			BUG_ON(!list_empty(&lo->plh_segs));
-	}
+	if (task->tk_status == 0 && lrp->res.lrs_present)
+		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
 	lo->plh_block_lgets--;
 	spin_unlock(&lo->plh_inode->i_lock);
 	dprintk("<-- %s\n", __func__);
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 12a31a9dbcd..bd61221ad2c 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -23,14 +23,6 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
 static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data);
 
-static struct file_system_type nfs4_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs4",
-	.mount		= nfs_fs_mount,
-	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
-
 static struct file_system_type nfs4_remote_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
@@ -344,14 +336,8 @@ static int __init init_nfs_v4(void)
 	if (err)
 		goto out1;
 
-	err = register_filesystem(&nfs4_fs_type);
-	if (err < 0)
-		goto out2;
-
 	register_nfs_version(&nfs_v4);
 	return 0;
-out2:
-	nfs4_unregister_sysctl();
 out1:
 	nfs_idmap_quit();
 out:
@@ -361,7 +347,6 @@ out:
 static void __exit exit_nfs_v4(void)
 {
 	unregister_nfs_version(&nfs_v4);
-	unregister_filesystem(&nfs4_fs_type);
 	nfs4_unregister_sysctl();
 	nfs_idmap_quit();
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ca13483edd6..8dba6bd4855 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5045,22 +5045,19 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 			 struct nfs_getaclres *res)
 {
 	unsigned int savep;
-	__be32 *bm_p;
 	uint32_t attrlen,
 		 bitmap[3] = {0};
 	int status;
-	size_t page_len = xdr->buf->page_len;
+	unsigned int pg_offset;
 
 	res->acl_len = 0;
 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
 		goto out;
 
-	bm_p = xdr->p;
-	res->acl_data_offset = be32_to_cpup(bm_p) + 2;
-	res->acl_data_offset <<= 2;
-	/* Check if the acl data starts beyond the allocated buffer */
-	if (res->acl_data_offset > page_len)
-		return -ERANGE;
+	xdr_enter_page(xdr, xdr->buf->page_len);
+
+	/* Calculate the offset of the page data */
+	pg_offset = xdr->buf->head[0].iov_len;
 
 	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
 		goto out;
@@ -5074,23 +5071,16 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 		/* The bitmap (xdr len + bitmaps) and the attr xdr len words
 		 * are stored with the acl data to handle the problem of
 		 * variable length bitmaps.*/
-		xdr->p = bm_p;
-
-		/* We ignore &savep and don't do consistency checks on
-		 * the attr length.  Let userspace figure it out.... */
-		attrlen += res->acl_data_offset;
-		if (attrlen > page_len) {
-			if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
-				/* getxattr interface called with a NULL buf */
-				res->acl_len = attrlen;
-				goto out;
-			}
-			dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
-					attrlen, page_len);
-			return -EINVAL;
-		}
-		xdr_read_pages(xdr, attrlen);
+		res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset;
 		res->acl_len = attrlen;
+
+		/* Check for receive buffer overflow */
+		if (res->acl_len > (xdr->nwords << 2) ||
+		    res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
+			res->acl_flags |= NFS4_ACL_TRUNC;
+			dprintk("NFS: acl reply: attrlen %u > page_len %u\n",
+					attrlen, xdr->nwords << 2);
+		}
 	} else
 		status = -EOPNOTSUPP;
 
@@ -6235,7 +6225,8 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	if (decode_getfh(xdr, &res->fh) != 0)
+	status = decode_getfh(xdr, &res->fh);
+	if (status)
 		goto out;
 	decode_getfattr(xdr, res->f_attr, res->server);
 out:
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index f50d3e8d6f2..ea6d111b03e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -570,17 +570,66 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
 		return false;
 
 	return pgio->pg_count + req->wb_bytes <=
-			OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+			(unsigned long)pgio->pg_layout_private;
+}
+
+void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	pnfs_generic_pg_init_read(pgio, req);
+	if (unlikely(pgio->pg_lseg == NULL))
+		return; /* Not pNFS */
+
+	pgio->pg_layout_private = (void *)
+				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+}
+
+static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
+				   unsigned long *stripe_end)
+{
+	u32 stripe_off;
+	unsigned stripe_size;
+
+	if (layout->raid_algorithm == PNFS_OSD_RAID_0)
+		return true;
+
+	stripe_size = layout->stripe_unit *
+				(layout->group_width - layout->parity);
+
+	div_u64_rem(offset, stripe_size, &stripe_off);
+	if (!stripe_off)
+		return true;
+
+	*stripe_end = stripe_size - stripe_off;
+	return false;
+}
+
+void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	unsigned long stripe_end = 0;
+
+	pnfs_generic_pg_init_write(pgio, req);
+	if (unlikely(pgio->pg_lseg == NULL))
+		return; /* Not pNFS */
+
+	if (req->wb_offset ||
+	    !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE,
+			       &OBJIO_LSEG(pgio->pg_lseg)->layout,
+			       &stripe_end)) {
+		pgio->pg_layout_private = (void *)stripe_end;
+	} else {
+		pgio->pg_layout_private = (void *)
+				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+	}
 }
 
 static const struct nfs_pageio_ops objio_pg_read_ops = {
-	.pg_init = pnfs_generic_pg_init_read,
+	.pg_init = objio_init_read,
 	.pg_test = objio_pg_test,
 	.pg_doio = pnfs_generic_pg_readpages,
 };
 
 static const struct nfs_pageio_ops objio_pg_write_ops = {
-	.pg_init = pnfs_generic_pg_init_write,
+	.pg_init = objio_init_write,
 	.pg_test = objio_pg_test,
 	.pg_doio = pnfs_generic_pg_writepages,
 };
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 1a6732ed04a..311a79681e2 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -49,6 +49,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 	hdr->io_start = req_offset(hdr->req);
 	hdr->good_bytes = desc->pg_count;
 	hdr->dreq = desc->pg_dreq;
+	hdr->layout_private = desc->pg_layout_private;
 	hdr->release = release;
 	hdr->completion_ops = desc->pg_completion_ops;
 	if (hdr->completion_ops->init_hdr)
@@ -268,6 +269,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_error = 0;
 	desc->pg_lseg = NULL;
 	desc->pg_dreq = NULL;
+	desc->pg_layout_private = NULL;
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init);
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 76875bfcf19..2e00feacd4b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -583,9 +583,6 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs4_layoutget *lgp;
 	struct pnfs_layout_segment *lseg = NULL;
-	struct page **pages = NULL;
-	int i;
-	u32 max_resp_sz, max_pages;
 
 	dprintk("--> %s\n", __func__);
 
@@ -594,20 +591,6 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	if (lgp == NULL)
 		return NULL;
 
-	/* allocate pages for xdr post processing */
-	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-	max_pages = nfs_page_array_len(0, max_resp_sz);
-
-	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
-	if (!pages)
-		goto out_err_free;
-
-	for (i = 0; i < max_pages; i++) {
-		pages[i] = alloc_page(gfp_flags);
-		if (!pages[i])
-			goto out_err_free;
-	}
-
 	lgp->args.minlength = PAGE_CACHE_SIZE;
 	if (lgp->args.minlength > range->length)
 		lgp->args.minlength = range->length;
@@ -616,39 +599,19 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	lgp->args.type = server->pnfs_curr_ld->id;
 	lgp->args.inode = ino;
 	lgp->args.ctx = get_nfs_open_context(ctx);
-	lgp->args.layout.pages = pages;
-	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
 	lgp->lsegpp = &lseg;
 	lgp->gfp_flags = gfp_flags;
 
 	/* Synchronously retrieve layout information from server and
 	 * store in lseg.
 	 */
-	nfs4_proc_layoutget(lgp);
+	nfs4_proc_layoutget(lgp, gfp_flags);
 	if (!lseg) {
 		/* remember that LAYOUTGET failed and suspend trying */
 		set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
 	}
 
-	/* free xdr pages */
-	for (i = 0; i < max_pages; i++)
-		__free_page(pages[i]);
-	kfree(pages);
-
 	return lseg;
-
-out_err_free:
-	/* free any allocated xdr pages, lgp as it's not used */
-	if (pages) {
-		for (i = 0; i < max_pages; i++) {
-			if (!pages[i])
-				break;
-			__free_page(pages[i]);
-		}
-		kfree(pages);
-	}
-	kfree(lgp);
-	return NULL;
 }
 
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 2c6c80503ba..745aa1b39e7 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -172,7 +172,7 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
 				   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev);
-extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
 /* pnfs.c */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ac6a3c55dce..d2c7f5db084 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -319,6 +319,34 @@ EXPORT_SYMBOL_GPL(nfs_sops);
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *);
 static int nfs4_validate_mount_data(void *options,
 	struct nfs_parsed_mount_data *args, const char *dev_name);
+
+struct file_system_type nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.mount		= nfs_fs_mount,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+EXPORT_SYMBOL_GPL(nfs4_fs_type);
+
+static int __init register_nfs4_fs(void)
+{
+	return register_filesystem(&nfs4_fs_type);
+}
+
+static void unregister_nfs4_fs(void)
+{
+	unregister_filesystem(&nfs4_fs_type);
+}
+#else
+static int __init register_nfs4_fs(void)
+{
+	return 0;
+}
+
+static void unregister_nfs4_fs(void)
+{
+}
 #endif
 
 static struct shrinker acl_shrinker = {
@@ -337,12 +365,18 @@ int __init register_nfs_fs(void)
 	if (ret < 0)
 		goto error_0;
 
-	ret = nfs_register_sysctl();
+	ret = register_nfs4_fs();
 	if (ret < 0)
 		goto error_1;
+
+	ret = nfs_register_sysctl();
+	if (ret < 0)
+		goto error_2;
 	register_shrinker(&acl_shrinker);
 	return 0;
 
+error_2:
+	unregister_nfs4_fs();
 error_1:
 	unregister_filesystem(&nfs_fs_type);
 error_0:
@@ -356,6 +390,7 @@ void __exit unregister_nfs_fs(void)
 {
 	unregister_shrinker(&acl_shrinker);
 	nfs_unregister_sysctl();
+	unregister_nfs4_fs();
 	unregister_filesystem(&nfs_fs_type);
 }
 
@@ -1502,7 +1537,7 @@ static int nfs_parse_mount_options(char *raw,
 
 	/*
 	 * verify that any proto=/mountproto= options match the address
-	 * familiies in the addr=/mountaddr= options.
+	 * families in the addr=/mountaddr= options.
 	 */
 	if (protofamily != AF_UNSPEC &&
 	    protofamily != mnt->nfs_server.address.ss_family)
@@ -1832,6 +1867,7 @@ static int nfs23_validate_mount_data(void *options,
 
 		memcpy(sap, &data->addr, sizeof(data->addr));
 		args->nfs_server.addrlen = sizeof(data->addr);
+		args->nfs_server.port = ntohs(data->addr.sin_port);
 		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
@@ -2529,6 +2565,7 @@ static int nfs4_validate_mount_data(void *options,
 			return -EFAULT;
 		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
+		args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
 
 		if (data->auth_flavourlen) {
 			if (data->auth_flavourlen > 1)
@@ -2645,4 +2682,6 @@ MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
 module_param(send_implementation_id, ushort, 0644);
 MODULE_PARM_DESC(send_implementation_id,
 		"Send implementation ID with NFSv4.1 exchange_id");
+MODULE_ALIAS("nfs4");
+
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5829d0ce7cf..e3b55372726 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1814,19 +1814,19 @@ int __init nfs_init_writepagecache(void)
 	nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
 						     nfs_wdata_cachep);
 	if (nfs_wdata_mempool == NULL)
-		return -ENOMEM;
+		goto out_destroy_write_cache;
 
 	nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
 					     sizeof(struct nfs_commit_data),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_cdata_cachep == NULL)
-		return -ENOMEM;
+		goto out_destroy_write_mempool;
 
 	nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
 						      nfs_wdata_cachep);
 	if (nfs_commit_mempool == NULL)
-		return -ENOMEM;
+		goto out_destroy_commit_cache;
 
 	/*
 	 * NFS congestion size, scale with available memory.
@@ -1849,11 +1849,20 @@ int __init nfs_init_writepagecache(void)
 		nfs_congestion_kb = 256*1024;
 
 	return 0;
+
+out_destroy_commit_cache:
+	kmem_cache_destroy(nfs_cdata_cachep);
+out_destroy_write_mempool:
+	mempool_destroy(nfs_wdata_mempool);
+out_destroy_write_cache:
+	kmem_cache_destroy(nfs_wdata_cachep);
+	return -ENOMEM;
 }
 
 void nfs_destroy_writepagecache(void)
 {
 	mempool_destroy(nfs_commit_mempool);
+	kmem_cache_destroy(nfs_cdata_cachep);
 	mempool_destroy(nfs_wdata_mempool);
 	kmem_cache_destroy(nfs_wdata_cachep);
 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cbaf4f8bb7b..4c7bd35b187 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -651,12 +651,12 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
 
 	if (clp->cl_minorversion == 0) {
 		if (!clp->cl_cred.cr_principal &&
-				(clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+				(clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
 			return -EINVAL;
 		args.client_name = clp->cl_cred.cr_principal;
 		args.prognumber	= conn->cb_prog,
 		args.protocol = XPRT_TRANSPORT_TCP;
-		args.authflavor = clp->cl_flavor;
+		args.authflavor = clp->cl_cred.cr_flavor;
 		clp->cl_cb_ident = conn->cb_ident;
 	} else {
 		if (!conn->cb_xprt)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e6173147f98..22bd0a66c35 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -231,7 +231,6 @@ struct nfs4_client {
 	nfs4_verifier		cl_verifier; 	/* generated by client */
 	time_t                  cl_time;        /* time of last lease renewal */
 	struct sockaddr_storage	cl_addr; 	/* client ipaddress */
-	u32			cl_flavor;	/* setclientid pseudoflavor */
 	struct svc_cred		cl_cred; 	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dfafeb2b05a..eb7cc91b725 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -462,9 +462,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 
 	err = ERR_PTR(-ENOMEM);
 	inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
-	if (h)
-		sysctl_head_finish(h);
-
 	if (!inode)
 		goto out;
 
@@ -473,6 +470,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 	d_add(dentry, inode);
 
 out:
+	if (h)
+		sysctl_head_finish(h);
 	sysctl_head_finish(head);
 	return err;
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 36a29b753c7..c495a3055e2 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1589,10 +1589,10 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 		goto out;
 	}
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warn[cnt].w_type = QUOTA_NL_NOWARN;
 
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!dquots[cnt])
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4c0c7d163d1..a98b7740a0f 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1334,9 +1334,7 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
 	else if (bitmap == 0)
 		block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
 
-	reiserfs_write_unlock(sb);
 	bh = sb_bread(sb, block);
-	reiserfs_write_lock(sb);
 	if (bh == NULL)
 		reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
 		                 "reading failed", __func__, block);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a6d4268fb6c..855da58db14 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -76,10 +76,10 @@ void reiserfs_evict_inode(struct inode *inode)
 		;
 	}
       out:
+	reiserfs_write_unlock_once(inode->i_sb, depth);
 	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
 	dquot_drop(inode);
 	inode->i_blocks = 0;
-	reiserfs_write_unlock_once(inode->i_sb, depth);
 	return;
 
 no_delete:
diff --git a/fs/stat.c b/fs/stat.c
index b6ff11825fc..40780229a03 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL(vfs_getattr);
 int vfs_fstat(unsigned int fd, struct kstat *stat)
 {
 	int fput_needed;
-	struct file *f = fget_light(fd, &fput_needed);
+	struct file *f = fget_raw_light(fd, &fput_needed);
 	int error = -EBADF;
 
 	if (f) {
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8b8cc4e945f..760de723dad 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -167,7 +167,7 @@ struct ubifs_global_debug_info {
 #define ubifs_dbg_msg(type, fmt, ...) \
 	pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
 
-#define DBG_KEY_BUF_LEN 32
+#define DBG_KEY_BUF_LEN 48
 #define ubifs_dbg_msg_key(type, key, fmt, ...) do {                            \
 	char __tmp_key_buf[DBG_KEY_BUF_LEN];                                   \
 	pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__,             \
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ce33b2beb15..8640920766e 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1749,7 +1749,10 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
 	return 0;
 
 out_err:
-	ubifs_lpt_free(c, 0);
+	if (wr)
+		ubifs_lpt_free(c, 1);
+	if (rd)
+		ubifs_lpt_free(c, 0);
 	return err;
 }
 
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c30d976b4be..edeec499c04 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -788,7 +788,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 
 corrupted_rescan:
 	/* Re-scan the corrupted data with verbose messages */
-	ubifs_err("corruptio %d", ret);
+	ubifs_err("corruption %d", ret);
 	ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
 corrupted:
 	ubifs_scanned_corruption(c, lnum, offs, buf);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index eba46d4a761..94d78fc5d4e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1026,7 +1026,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
 	c->replaying = 1;
 	lnum = c->ltail_lnum = c->lhead_lnum;
 
-	lnum = UBIFS_LOG_LNUM;
 	do {
 		err = replay_log_leb(c, lnum, 0, c->sbuf);
 		if (err == 1)
@@ -1035,7 +1034,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
 		if (err)
 			goto out;
 		lnum = ubifs_next_log_lnum(c, lnum);
-	} while (lnum != UBIFS_LOG_LNUM);
+	} while (lnum != c->ltail_lnum);
 
 	err = replay_buds(c);
 	if (err)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c3fa6c5327a..71a197f0f93 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1157,9 +1157,6 @@ static int check_free_space(struct ubifs_info *c)
  *
  * This function mounts UBIFS file system. Returns zero in case of success and
  * a negative error code in case of failure.
- *
- * Note, the function does not de-allocate resources it it fails half way
- * through, and the caller has to do this instead.
  */
 static int mount_ubifs(struct ubifs_info *c)
 {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7f3f7ba3df6..d1c6093fd3d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -39,20 +39,24 @@
 #include "udf_i.h"
 #include "udf_sb.h"
 
-static int udf_adinicb_readpage(struct file *file, struct page *page)
+static void __udf_adinicb_readpage(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	char *kaddr;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
-	BUG_ON(!PageLocked(page));
-
 	kaddr = kmap(page);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
 	memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
+	memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
 	flush_dcache_page(page);
 	SetPageUptodate(page);
 	kunmap(page);
+}
+
+static int udf_adinicb_readpage(struct file *file, struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	__udf_adinicb_readpage(page);
 	unlock_page(page);
 
 	return 0;
@@ -77,6 +81,25 @@ static int udf_adinicb_writepage(struct page *page,
 	return 0;
 }
 
+static int udf_adinicb_write_begin(struct file *file,
+			struct address_space *mapping, loff_t pos,
+			unsigned len, unsigned flags, struct page **pagep,
+			void **fsdata)
+{
+	struct page *page;
+
+	if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
+		return -EIO;
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
+		__udf_adinicb_readpage(page);
+	return 0;
+}
+
 static int udf_adinicb_write_end(struct file *file,
 			struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
@@ -98,8 +121,8 @@ static int udf_adinicb_write_end(struct file *file,
 const struct address_space_operations udf_adinicb_aops = {
 	.readpage	= udf_adinicb_readpage,
 	.writepage	= udf_adinicb_writepage,
-	.write_begin = simple_write_begin,
-	.write_end = udf_adinicb_write_end,
+	.write_begin	= udf_adinicb_write_begin,
+	.write_end	= udf_adinicb_write_end,
 };
 
 static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fafaad795cd..aa233469b3c 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1124,14 +1124,17 @@ int udf_setsize(struct inode *inode, loff_t newsize)
 				if (err)
 					return err;
 				down_write(&iinfo->i_data_sem);
-			} else
+			} else {
 				iinfo->i_lenAlloc = newsize;
+				goto set_size;
+			}
 		}
 		err = udf_extend_file(inode, newsize);
 		if (err) {
 			up_write(&iinfo->i_data_sem);
 			return err;
 		}
+set_size:
 		truncate_setsize(inode, newsize);
 		up_write(&iinfo->i_data_sem);
 	} else {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index dcbf98722af..18fc038a438 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1344,6 +1344,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 		udf_err(sb, "error loading logical volume descriptor: "
 			"Partition table too long (%u > %lu)\n", table_len,
 			sb->s_blocksize - sizeof(*lvd));
+		ret = 1;
 		goto out_bh;
 	}
 
@@ -1388,8 +1389,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 						UDF_ID_SPARABLE,
 						strlen(UDF_ID_SPARABLE))) {
 				if (udf_load_sparable_map(sb, map,
-				    (struct sparablePartitionMap *)gpm) < 0)
+				    (struct sparablePartitionMap *)gpm) < 0) {
+					ret = 1;
 					goto out_bh;
+				}
 			} else if (!strncmp(upm2->partIdent.ident,
 						UDF_ID_METADATA,
 						strlen(UDF_ID_METADATA))) {
@@ -2000,6 +2003,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 			if (!silent)
 				pr_notice("Rescanning with blocksize %d\n",
 					  UDF_DEFAULT_BLOCKSIZE);
+			brelse(sbi->s_lvid_bh);
+			sbi->s_lvid_bh = NULL;
 			uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
 			ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 		}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d7a9dd735e1..933b7930b86 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -96,6 +96,7 @@ xfs_buf_lru_add(
 		atomic_inc(&bp->b_hold);
 		list_add_tail(&bp->b_lru, &btp->bt_lru);
 		btp->bt_lru_nr++;
+		bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
 	}
 	spin_unlock(&btp->bt_lru_lock);
 }
@@ -154,7 +155,8 @@ xfs_buf_stale(
 		struct xfs_buftarg *btp = bp->b_target;
 
 		spin_lock(&btp->bt_lru_lock);
-		if (!list_empty(&bp->b_lru)) {
+		if (!list_empty(&bp->b_lru) &&
+		    !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
 			list_del_init(&bp->b_lru);
 			btp->bt_lru_nr--;
 			atomic_dec(&bp->b_hold);
@@ -1501,6 +1503,7 @@ xfs_buftarg_shrink(
 		 */
 		list_move(&bp->b_lru, &dispose);
 		btp->bt_lru_nr--;
+		bp->b_lru_flags |= _XBF_LRU_DISPOSE;
 	}
 	spin_unlock(&btp->bt_lru_lock);
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d03b73b9604..7c0b6a0a155 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -38,27 +38,28 @@ typedef enum {
 	XBRW_ZERO = 3,			/* Zero target memory */
 } xfs_buf_rw_t;
 
-#define XBF_READ	(1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE	(1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD	(1 << 2) /* asynchronous read-ahead */
-#define XBF_ASYNC	(1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE	(1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE	(1 << 6) /* buffer has been staled, do not find it */
+#define XBF_READ	 (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE	 (1 << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD	 (1 << 2) /* asynchronous read-ahead */
+#define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
 
 /* I/O hints for the BIO layer */
-#define XBF_SYNCIO	(1 << 10)/* treat this buffer as synchronous I/O */
-#define XBF_FUA		(1 << 11)/* force cache write through mode */
-#define XBF_FLUSH	(1 << 12)/* flush the disk cache before a write */
+#define XBF_SYNCIO	 (1 << 10)/* treat this buffer as synchronous I/O */
+#define XBF_FUA		 (1 << 11)/* force cache write through mode */
+#define XBF_FLUSH	 (1 << 12)/* flush the disk cache before a write */
 
 /* flags used only as arguments to access routines */
-#define XBF_TRYLOCK	(1 << 16)/* lock requested, but do not wait */
-#define XBF_UNMAPPED	(1 << 17)/* do not map the buffer */
+#define XBF_TRYLOCK	 (1 << 16)/* lock requested, but do not wait */
+#define XBF_UNMAPPED	 (1 << 17)/* do not map the buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES	(1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM	(1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q	(1 << 22)/* buffer on a delwri queue */
-#define _XBF_COMPOUND	(1 << 23)/* compound buffer */
+#define _XBF_PAGES	 (1 << 20)/* backed by refcounted pages */
+#define _XBF_KMEM	 (1 << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q	 (1 << 22)/* buffer on a delwri queue */
+#define _XBF_COMPOUND	 (1 << 23)/* compound buffer */
+#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
 
 typedef unsigned int xfs_buf_flags_t;
 
@@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_SYNCIO,		"SYNCIO" }, \
 	{ XBF_FUA,		"FUA" }, \
 	{ XBF_FLUSH,		"FLUSH" }, \
-	{ XBF_TRYLOCK,		"TRYLOCK" }, 	/* should never be set */\
+	{ XBF_TRYLOCK,		"TRYLOCK" },	/* should never be set */\
 	{ XBF_UNMAPPED,		"UNMAPPED" },	/* ditto */\
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_COMPOUND,	"COMPOUND" }
+	{ _XBF_COMPOUND,	"COMPOUND" }, \
+	{ _XBF_LRU_DISPOSE,	"LRU_DISPOSE" }
 
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
@@ -124,7 +126,12 @@ typedef struct xfs_buf {
 	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
 
+	/*
+	 * concurrent access to b_lru and b_lru_flags are protected by
+	 * bt_lru_lock and not by b_sema
+	 */
 	struct list_head	b_lru;		/* lru list */
+	xfs_buf_flags_t		b_lru_flags;	/* internal lru status flags */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index f9c3fe304a1..69cf4fcde03 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -179,12 +179,14 @@ xfs_ioc_trim(
 	 * used by the fstrim application.  In the end it really doesn't
 	 * matter as trimming blocks is an advisory interface.
 	 */
+	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+	    range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
+		return -XFS_ERROR(EINVAL);
+
 	start = BTOBB(range.start);
 	end = start + BTOBBT(range.len) - 1;
 	minlen = BTOBB(max_t(u64, granularity, range.minlen));
 
-	if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
-		return -XFS_ERROR(EINVAL);
 	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
 		end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
 
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 21e37b55f7e..5aceb3f8ecd 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -962,23 +962,22 @@ xfs_dialloc(
 		if (!pag->pagi_freecount && !okalloc)
 			goto nextag;
 
+		/*
+		 * Then read in the AGI buffer and recheck with the AGI buffer
+		 * lock held.
+		 */
 		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
 		if (error)
 			goto out_error;
 
-		/*
-		 * Once the AGI has been read in we have to recheck
-		 * pagi_freecount with the AGI buffer lock held.
-		 */
 		if (pag->pagi_freecount) {
 			xfs_perag_put(pag);
 			goto out_alloc;
 		}
 
-		if (!okalloc) {
-			xfs_trans_brelse(tp, agbp);
-			goto nextag;
-		}
+		if (!okalloc)
+			goto nextag_relse_buffer;
+
 
 		error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
 		if (error) {
@@ -1007,6 +1006,8 @@ xfs_dialloc(
 			return 0;
 		}
 
+nextag_relse_buffer:
+		xfs_trans_brelse(tp, agbp);
 nextag:
 		xfs_perag_put(pag);
 		if (++agno == mp->m_sb.sb_agcount)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 92d4331cd4f..ca28a4ba4b5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -857,7 +857,7 @@ xfs_rtbuf_get(
 	xfs_buf_t	*bp;		/* block buffer, result */
 	xfs_inode_t	*ip;		/* bitmap or summary inode */
 	xfs_bmbt_irec_t	map;
-	int		nmap;
+	int		nmap = 1;
 	int		error;		/* error value */
 
 	ip = issum ? mp->m_rsumip : mp->m_rbmip;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bdaf4cb9f4a..19e2380fb86 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -919,6 +919,7 @@ xfs_fs_put_super(
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_filestream_unmount(mp);
+	cancel_delayed_work_sync(&mp->m_sync_work);
 	xfs_unmountfs(mp);
 	xfs_syncd_stop(mp);
 	xfs_freesb(mp);