From e02119d5a7b4396c5a872582fddc8bd6d305a70a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 5 Sep 2008 16:13:11 -0400
Subject: Btrfs: Add a write ahead tree log to optimize synchronous operations

File syncs and directory syncs are optimized by copying their
items into a special (copy-on-write) log tree.  There is one log tree per
subvolume and the btrfs super block points to a tree of log tree roots.

After a crash, items are copied out of the log tree and back into the
subvolume.  See tree-log.c for all the details.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 2804 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 2804 insertions(+)
 create mode 100644 fs/btrfs/tree-log.c

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 00000000000..d1ce8314b94
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2804 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "compat.h"
+
+/*
+ * Values for the inode_only argument of __btrfs_log_inode (declared
+ * below):
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+
+/*
+ * Stages for the tree walking done at replay time:
+ *
+ * LOG_WALK_PIN_ONLY (0): only pin down the blocks we find.
+ * LOG_WALK_REPLAY_INODES (1): make sure that all the inodes we find in
+ * the log are created in the subvolume.
+ * LOG_WALK_REPLAY_ALL (2): deal with directories and links and extents
+ * and all the other fun semantics.
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_ALL 2
+
+/*
+ * forward declaration; inode_only takes one of the LOG_INODE_* values
+ * defined above
+ */
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, struct inode *inode,
+			     int inode_only);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree and 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction.  Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree.  Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times: once to pin down all the extents it is
+ * using in ram, once to create all the inodes logged in the tree,
+ * and once to do all the other items.
+ */
+
+/*
+ * btrfs_add_log_tree adds a new per-subvolume log tree into the
+ * tree of log tree roots.  This must be called with a tree log transaction
+ * running (see start_log_trans).
+ *
+ * An empty leaf is allocated and initialized as the first block of the
+ * new log tree, a root item describing it is inserted into
+ * fs_info->log_root_tree under the key
+ * (BTRFS_TREE_LOG_OBJECTID, BTRFS_ROOT_ITEM_KEY, <objectid of 'root'>),
+ * and the root read back for that key is stored in root->log_root.
+ *
+ * Returns 0 on success, or a negative errno if the leaf could not be
+ * allocated, the root item could not be inserted or the new root could
+ * not be read back.
+ */
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root)
+{
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	struct btrfs_root *new_root;
+	int ret;
+	u64 objectid = root->root_key.objectid;
+
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      BTRFS_TREE_LOG_OBJECTID,
+				      0, 0, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		return ret;
+	}
+
+	/* set up the header of the empty leaf */
+	btrfs_set_header_nritems(leaf, 0);
+	btrfs_set_header_level(leaf, 0);
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+
+	write_extent_buffer(leaf, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	/* fill in the inode embedded in the root item */
+	inode_item = &root_item.inode;
+	memset(inode_item, 0, sizeof(*inode_item));
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+	/* the root item points at the leaf we just initialized */
+	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_level(&root_item, 0);
+	btrfs_set_root_refs(&root_item, 0);
+	btrfs_set_root_used(&root_item, 0);
+
+	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+	root_item.drop_level = 0;
+
+	btrfs_tree_unlock(leaf);
+	free_extent_buffer(leaf);
+	leaf = NULL;
+
+	btrfs_set_root_dirid(&root_item, 0);
+
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.offset = objectid;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
+				&root_item);
+	if (ret)
+		goto fail;
+
+	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
+					       &key);
+	/*
+	 * NOTE(review): btrfs_read_fs_root_no_radix is expected to report
+	 * failures as ERR_PTR values (confirm against disk-io.c).  A plain
+	 * NULL test would let such a value through to the dereferences
+	 * below, so turn it into an error return instead.
+	 */
+	if (IS_ERR(new_root)) {
+		ret = PTR_ERR(new_root);
+		goto fail;
+	}
+	BUG_ON(!new_root);
+
+	WARN_ON(root->log_root);
+	root->log_root = new_root;
+
+	/*
+	 * log trees do not get reference counted because they go away
+	 * before a real commit is actually done.  They do store pointers
+	 * to file data extents, and those reference counts still get
+	 * updated (along with back refs to the log tree).
+	 */
+	new_root->ref_cows = 0;
+	new_root->last_trans = trans->transid;
+fail:
+	return ret;
+}
+
+/*
+ * Begin a log sub transaction on 'root'.
+ *
+ * With tree_log_mutex held, the tree of log roots and the log tree of
+ * this root are created if they are not there yet.  The log writer
+ * count is then raised so that people syncing the tree wait for us to
+ * finish, and the batch counter is bumped.
+ *
+ * Setup failures trip BUG_ON; the function itself always returns 0.
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	int err;
+
+	mutex_lock(&info->tree_log_mutex);
+
+	/* no tree of log roots yet, set one up */
+	if (info->log_root_tree == NULL) {
+		err = btrfs_init_log_root_tree(trans, info);
+		BUG_ON(err);
+	}
+
+	/* no log tree for this root yet, add one */
+	if (root->log_root == NULL) {
+		err = btrfs_add_log_tree(trans, root);
+		BUG_ON(err);
+	}
+
+	atomic_inc(&info->tree_log_writers);
+	info->tree_log_batch++;
+
+	mutex_unlock(&info->tree_log_mutex);
+	return 0;
+}
+
+/*
+ * Join the log transaction of 'root' if it already has a log tree.
+ *
+ * Returns 0 if there was a log transaction running and we were able
+ * to join it (the writer count and batch counter are raised, just as
+ * start_log_trans does), or -ENOENT if there was no log transaction
+ * in progress.
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info;
+	int joined = -ENOENT;
+
+	/* unlocked peek first; without a log tree there is nothing to join */
+	smp_mb();
+	if (root->log_root == NULL)
+		return -ENOENT;
+
+	info = root->fs_info;
+
+	/* look again now that we hold the mutex */
+	mutex_lock(&info->tree_log_mutex);
+	if (root->log_root != NULL) {
+		atomic_inc(&info->tree_log_writers);
+		info->tree_log_batch++;
+		joined = 0;
+	}
+	mutex_unlock(&info->tree_log_mutex);
+
+	return joined;
+}
+
+/*
+ * Drop our count from the log tree writers now that we are done
+ * changing the log tree, and wake up anyone sleeping on tree_log_wait
+ * to do a sync.  Always returns 0.
+ */
+static int end_log_trans(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+
+	atomic_dec(&info->tree_log_writers);
+
+	/* barrier between the counter update and the waitqueue check */
+	smp_mb();
+	if (!waitqueue_active(&info->tree_log_wait))
+		return 0;
+
+	wake_up(&info->tree_log_wait);
+	return 0;
+}
+
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree.  The stage field tells us which part
+ * of the log tree processing we are currently doing.  The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+	/* should we free the extent on disk when done?  This is used
+	 * at transaction commit time while freeing a log tree
+	 */
+	int free;
+
+	/* should we write out the extent buffer?  This is used
+	 * while flushing the log tree to disk during a sync
+	 * (process_one_buffer only writes buffers that are up to date)
+	 */
+	int write;
+
+	/* should we wait for the extent buffer io to finish?  Also used
+	 * while flushing the log tree to disk for a sync
+	 */
+	int wait;
+
+	/* pin only walk, we record which extents on disk belong to the
+	 * log trees
+	 */
+	int pin;
+
+	/* what stage of the replay code we're currently in; one of the
+	 * LOG_WALK_* values
+	 */
+	int stage;
+
+	/* the root we are currently replaying */
+	struct btrfs_root *replay_dest;
+
+	/* the trans handle for the current replay */
+	struct btrfs_trans_handle *trans;
+
+	/* the function that gets used to process blocks we find in the
+	 * tree.  Note the extent_buffer might not be up to date when it is
+	 * passed in, and it must be checked or read if you need the data
+	 * inside it.
+	 *
+	 * It is handed the log root, the block, this walk_control and a
+	 * generation number (process_one_buffer passes the latter to
+	 * btrfs_buffer_uptodate).
+	 */
+	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+			    struct walk_control *wc, u64 gen);
+};
+
+/*
+ * process_func for the pin, write and wait walks.
+ *
+ * When wc->pin is set the range covered by 'eb' is recorded as pinned,
+ * with alloc_mutex held.  When the buffer is up to date for generation
+ * 'gen' it is written out if wc->write is set and waited on if
+ * wc->wait is set.  Always returns 0.
+ */
+static int process_one_buffer(struct btrfs_root *log,
+			      struct extent_buffer *eb,
+			      struct walk_control *wc, u64 gen)
+{
+	struct btrfs_fs_info *info;
+
+	if (wc->pin) {
+		info = log->fs_info;
+		mutex_lock(&info->alloc_mutex);
+		btrfs_update_pinned_extents(info->extent_root,
+					    eb->start, eb->len, 1);
+		mutex_unlock(&info->alloc_mutex);
+	}
+
+	/* buffers that are not up to date are neither written nor waited on */
+	if (!btrfs_buffer_uptodate(eb, gen))
+		return 0;
+
+	if (wc->write)
+		btrfs_write_tree_block(eb);
+	if (wc->wait)
+		btrfs_wait_tree_block_writeback(eb);
+
+	return 0;
+}
+
+/*
+ * Item overwrite used by replay and tree logging.  eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree with an item of the same
+ * size and the same contents, nothing is modified.  Otherwise the existing
+ * item is overwritten: if it is too large it is truncated, and if it is
+ * too small it is deleted and inserted again at the right size.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ *
+ * Returns 0 on success, or -ENOMEM if the scratch buffers used to compare
+ * the two items cannot be allocated.  Failed inserts and deletes hit
+ * BUG()/BUG_ON().
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct extent_buffer *eb, int slot,
+				   struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size;
+	u64 saved_i_size = 0;
+	int save_old_i_size = 0;
+	unsigned long src_ptr;
+	unsigned long dst_ptr;
+	int overwrite_root = 0;
+
+	/* set when the destination is not a log tree */
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+		overwrite_root = 1;
+
+	item_size = btrfs_item_size_nr(eb, slot);
+	src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+	/* look for the key in the destination tree */
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret == 0) {
+		char *src_copy;
+		char *dst_copy;
+		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+						  path->slots[0]);
+		if (dst_size != item_size)
+			goto insert;
+
+		if (item_size == 0) {
+			btrfs_release_path(root, path);
+			return 0;
+		}
+		dst_copy = kmalloc(item_size, GFP_NOFS);
+		src_copy = kmalloc(item_size, GFP_NOFS);
+		/* kfree(NULL) is a no-op, so one exit covers both failures */
+		if (!dst_copy || !src_copy) {
+			btrfs_release_path(root, path);
+			kfree(dst_copy);
+			kfree(src_copy);
+			return -ENOMEM;
+		}
+
+		read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+				   item_size);
+		ret = memcmp(dst_copy, src_copy, item_size);
+
+		kfree(dst_copy);
+		kfree(src_copy);
+		/*
+		 * they have the same contents, just return, this saves
+		 * us from cowing blocks in the destination tree and doing
+		 * extra writes that may not have been done by a previous
+		 * sync
+		 */
+		if (ret == 0) {
+			btrfs_release_path(root, path);
+			return 0;
+		}
+
+	}
+insert:
+	btrfs_release_path(root, path);
+	/* try to insert the key into the destination tree */
+	ret = btrfs_insert_empty_item(trans, root, path,
+				      key, item_size);
+
+	/* make sure any existing item is the correct size */
+	if (ret == -EEXIST) {
+		u32 found_size;
+		found_size = btrfs_item_size_nr(path->nodes[0],
+						path->slots[0]);
+		if (found_size > item_size) {
+			btrfs_truncate_item(trans, root, path, item_size, 1);
+		} else if (found_size < item_size) {
+			ret = btrfs_del_item(trans, root,
+					     path);
+			BUG_ON(ret);
+
+			btrfs_release_path(root, path);
+			ret = btrfs_insert_empty_item(trans,
+				  root, path, key, item_size);
+			BUG_ON(ret);
+		}
+	} else if (ret) {
+		BUG();
+	}
+	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+					path->slots[0]);
+
+	/* don't overwrite an existing inode if the generation number
+	 * was logged as zero.  This is done when the tree logging code
+	 * is just logging an inode to make sure it exists after recovery.
+	 *
+	 * Also, don't overwrite i_size on directories during replay.
+	 * log replay inserts and removes directory items based on the
+	 * state of the tree found in the subvolume, and i_size is modified
+	 * as it goes
+	 */
+	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+		struct btrfs_inode_item *src_item;
+		struct btrfs_inode_item *dst_item;
+
+		src_item = (struct btrfs_inode_item *)src_ptr;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+		if (btrfs_inode_generation(eb, src_item) == 0)
+			goto no_copy;
+
+		if (overwrite_root &&
+		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+			save_old_i_size = 1;
+			saved_i_size = btrfs_inode_size(path->nodes[0],
+							dst_item);
+		}
+	}
+
+	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+			   src_ptr, item_size);
+
+	/* put back the directory i_size that the copy just clobbered */
+	if (save_old_i_size) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+	}
+
+	/* make sure the generation is filled in */
+	if (key->type == BTRFS_INODE_ITEM_KEY) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+			btrfs_set_inode_generation(path->nodes[0], dst_item,
+						   trans->transid);
+		}
+	}
+no_copy:
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(root, path);
+	return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ *
+ * Returns the inode with a reference held (drop it with iput), or NULL
+ * if the inode could not be obtained or turned out to be bad.
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+					     u64 objectid)
+{
+	struct inode *inode;
+
+	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+	/*
+	 * NOTE(review): btrfs_iget_locked is expected to return NULL when
+	 * no inode can be allocated (confirm against inode.c); don't
+	 * dereference it in that case.
+	 */
+	if (!inode)
+		return NULL;
+
+	/* a freshly created in-memory inode still has to be filled in */
+	if (inode->i_state & I_NEW) {
+		BTRFS_I(inode)->root = root;
+		BTRFS_I(inode)->location.objectid = objectid;
+		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+		BTRFS_I(inode)->location.offset = 0;
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
+
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'.  path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet.  So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ *
+ * Extent types other than regular and inline are skipped.  Returns 0
+ * on success, -EIO if the inode cannot be read, or the result of
+ * btrfs_inc_extent_ref when a reference is added to an existing
+ * extent.  Most other failures hit BUG_ON().
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct extent_buffer *eb, int slot,
+				      struct btrfs_key *key)
+{
+	int found_type;
+	u64 mask = root->sectorsize - 1;
+	u64 extent_end;
+	u64 alloc_hint;
+	u64 start = key->offset;
+	struct btrfs_file_extent_item *item;
+	struct inode *inode = NULL;
+	unsigned long size;
+	int ret = 0;
+
+	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(eb, item);
+
+	if (found_type == BTRFS_FILE_EXTENT_REG)
+		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
+	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		size = btrfs_file_extent_inline_len(eb,
+						    btrfs_item_nr(eb, slot));
+		/* round the end of the inline data up to a sector boundary */
+		extent_end = (start + size + mask) & ~mask;
+	} else {
+		ret = 0;
+		goto out;
+	}
+
+	inode = read_one_inode(root, key->objectid);
+	if (!inode) {
+		ret = -EIO;
+		goto out;
+	}
+
+	/*
+	 * first check to see if we already have this extent in the
+	 * file.  This must be done before the btrfs_drop_extents run
+	 * so we don't try to drop this extent.
+	 */
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       start, 0);
+
+	if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
+		struct btrfs_file_extent_item cmp1;
+		struct btrfs_file_extent_item cmp2;
+		struct btrfs_file_extent_item *existing;
+		struct extent_buffer *leaf;
+
+		leaf = path->nodes[0];
+		existing = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_file_extent_item);
+
+		read_extent_buffer(eb, &cmp1, (unsigned long)item,
+				   sizeof(cmp1));
+		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+				   sizeof(cmp2));
+
+		/*
+		 * we already have a pointer to this exact extent,
+		 * we don't have to do anything
+		 */
+		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+			btrfs_release_path(root, path);
+			goto out;
+		}
+	}
+	btrfs_release_path(root, path);
+
+	/* drop any overlapping extents */
+	ret = btrfs_drop_extents(trans, root, inode,
+			 start, extent_end, start, &alloc_hint);
+	BUG_ON(ret);
+
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		struct btrfs_key ins;
+
+		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+		/* insert the extent pointer in the file */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+
+		/*
+		 * is this extent already allocated in the extent
+		 * allocation tree?  If so, just add a reference
+		 */
+		ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset);
+		btrfs_release_path(root, path);
+		if (ret == 0) {
+			ret = btrfs_inc_extent_ref(trans, root,
+				   ins.objectid, ins.offset,
+				   root->root_key.objectid,
+				   trans->transid, key->objectid, start);
+		} else {
+			/*
+			 * insert the extent pointer in the extent
+			 * allocation tree
+			 */
+			ret = btrfs_alloc_logged_extent(trans, root,
+						root->root_key.objectid,
+						trans->transid, key->objectid,
+						start, &ins);
+			BUG_ON(ret);
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		/* inline extents are easy, we just overwrite them */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+	}
+	/* btrfs_drop_extents changes i_blocks, update it here */
+	inode->i_blocks += (extent_end - start) >> 9;
+	btrfs_update_inode(trans, root, inode);
+out:
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item.  'di' must live in the leaf at path->nodes[0]; the path is
+ * released before returning.
+ *
+ * Returns the result of btrfs_unlink_inode, or -ENOMEM if the name
+ * cannot be copied out of the leaf.
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct inode *dir,
+				      struct btrfs_dir_item *di)
+{
+	struct inode *inode;
+	char *name;
+	int name_len;
+	struct extent_buffer *leaf;
+	struct btrfs_key location;
+	int ret;
+
+	leaf = path->nodes[0];
+
+	/* copy the target key and the name out before the path is released */
+	btrfs_dir_item_key_to_cpu(leaf, di, &location);
+	name_len = btrfs_dir_name_len(leaf, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name) {
+		btrfs_release_path(root, path);
+		return -ENOMEM;
+	}
+	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+	btrfs_release_path(root, path);
+
+	inode = read_one_inode(root, location.objectid);
+	BUG_ON(!inode);
+
+	/* raise the link count before asking for the unlink */
+	btrfs_inc_nlink(inode);
+	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+	kfree(name);
+
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Check whether directory 'dirid' already holds 'name' for inode
+ * 'objectid', both as a dir index item at sequence number 'index' and
+ * as a dir item looked up by name.
+ *
+ * Returns 1 only when both lookups succeed and both point at
+ * 'objectid', 0 otherwise.  The path is released on return.
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 dirid, u64 objectid, u64 index,
+				 const char *name, int name_len)
+{
+	struct btrfs_dir_item *item;
+	struct btrfs_key target;
+	int found = 0;
+
+	/* first the entry keyed by sequence number */
+	item = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+					   index, name, name_len, 0);
+	if (!item || IS_ERR(item))
+		goto done;
+	btrfs_dir_item_key_to_cpu(path->nodes[0], item, &target);
+	if (target.objectid != objectid)
+		goto done;
+
+	btrfs_release_path(root, path);
+
+	/* then the entry found through the name */
+	item = btrfs_lookup_dir_item(NULL, root, path, dirid, name,
+				     name_len, 0);
+	if (!item || IS_ERR(item))
+		goto done;
+	btrfs_dir_item_key_to_cpu(path->nodes[0], item, &target);
+	if (target.objectid == objectid)
+		found = 1;
+done:
+	btrfs_release_path(root, path);
+	return found;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode.  This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ *
+ * Returns 1 if the item at 'key' in 'log' holds a ref named 'name',
+ * 0 if the item is missing or holds no such name, and -ENOMEM if no
+ * path can be allocated.  Callers that only test for non-zero treat
+ * the error like a match, so nothing gets unlinked on the strength of
+ * a failed allocation.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+				   struct btrfs_key *key,
+				   char *name, int namelen)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	unsigned long name_ptr;
+	int found_name_len;
+	int item_size;
+	int ret;
+	int match = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+	if (ret != 0)
+		goto out;
+
+	/* walk the refs packed in the item: a ref struct, then its name */
+	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+	ptr_end = ptr + item_size;
+	while (ptr < ptr_end) {
+		ref = (struct btrfs_inode_ref *)ptr;
+		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
+		if (found_name_len == namelen) {
+			name_ptr = (unsigned long)(ref + 1);
+			ret = memcmp_extent_buffer(path->nodes[0], name,
+						   name_ptr, namelen);
+			if (ret == 0) {
+				match = 1;
+				goto out;
+			}
+		}
+		ptr = (unsigned long)(ref + 1) + found_name_len;
+	}
+out:
+	btrfs_free_path(path);
+	return match;
+}
+
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ * log is the log tree, searched to tell which of the names already in
+ * the subvolume are also logged.
+ *
+ * key->objectid is the inode the refs belong to and key->offset is the
+ * directory holding the names.
+ *
+ * Returns -ENOENT if the directory cannot be read and 0 otherwise.
+ * Most other failures hit BUG_ON().
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  struct extent_buffer *eb, int slot,
+				  struct btrfs_key *key)
+{
+	struct inode *dir;
+	int ret;
+	struct btrfs_key location;
+	struct btrfs_inode_ref *ref;
+	struct btrfs_dir_item *di;
+	struct inode *inode;
+	char *name;
+	int namelen;
+	unsigned long ref_ptr;
+	unsigned long ref_end;
+
+	location.objectid = key->objectid;
+	location.type = BTRFS_INODE_ITEM_KEY;
+	location.offset = 0;
+
+	/*
+	 * it is possible that we didn't log all the parent directories
+	 * for a given inode.  If we don't find the dir, just don't
+	 * copy the back ref in.  The link count fixup code will take
+	 * care of the rest
+	 */
+	dir = read_one_inode(root, key->offset);
+	if (!dir)
+		return -ENOENT;
+
+	/* it is the inode just read that must be checked here, not dir */
+	inode = read_one_inode(root, key->objectid);
+	BUG_ON(!inode);
+
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+again:
+	ref = (struct btrfs_inode_ref *)ref_ptr;
+
+	namelen = btrfs_inode_ref_name_len(eb, ref);
+	name = kmalloc(namelen, GFP_NOFS);
+	BUG_ON(!name);
+
+	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+
+	/* if we already have a perfect match, we're done */
+	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+			 btrfs_inode_ref_index(eb, ref),
+			 name, namelen)) {
+		goto out;
+	}
+
+	/*
+	 * look for a conflicting back reference in the metadata.
+	 * if we find one we have to unlink that name of the file
+	 * before we add our new link.  Later on, we overwrite any
+	 * existing back reference, and we don't want to create
+	 * dangling pointers in the directory.
+	 */
+conflict_again:
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret == 0) {
+		char *victim_name;
+		int victim_name_len;
+		struct btrfs_inode_ref *victim_ref;
+		unsigned long ptr;
+		unsigned long ptr_end;
+		struct extent_buffer *leaf = path->nodes[0];
+
+		/* are we trying to overwrite a back ref for the root directory
+		 * if so, just jump out, we're done
+		 */
+		if (key->objectid == key->offset) {
+			/* out_nowrite skips the kfree done at 'out' */
+			kfree(name);
+			goto out_nowrite;
+		}
+
+		/* check all the names in this back reference to see
+		 * if they are in the log.  if so, we allow them to stay
+		 * otherwise they must be unlinked as a conflict
+		 */
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+		while(ptr < ptr_end) {
+			victim_ref = (struct btrfs_inode_ref *)ptr;
+			victim_name_len = btrfs_inode_ref_name_len(leaf,
+								   victim_ref);
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			BUG_ON(!victim_name);
+
+			read_extent_buffer(leaf, victim_name,
+					   (unsigned long)(victim_ref + 1),
+					   victim_name_len);
+
+			if (!backref_in_log(log, key, victim_name,
+					    victim_name_len)) {
+				btrfs_inc_nlink(inode);
+				btrfs_release_path(root, path);
+				ret = btrfs_unlink_inode(trans, root, dir,
+							 inode, victim_name,
+							 victim_name_len);
+				kfree(victim_name);
+				btrfs_release_path(root, path);
+				/* the item changed under us, start over */
+				goto conflict_again;
+			}
+			kfree(victim_name);
+			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+		}
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+	/* look for a conflicting sequence number */
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					 btrfs_inode_ref_index(eb, ref),
+					 name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+
+	/* look for a conflicting name */
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				   name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+	/* insert our name */
+	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
+			     btrfs_inode_ref_index(eb, ref));
+	BUG_ON(ret);
+
+	btrfs_update_inode(trans, root, inode);
+
+out:
+	/* step to the next ref packed in the same item, if any */
+	ref_ptr = (unsigned long)(ref + 1) + namelen;
+	kfree(name);
+	if (ref_ptr < ref_end)
+		goto again;
+
+	/* finally write the back reference in the inode */
+	ret = overwrite_item(trans, root, path, eb, slot, key);
+	BUG_ON(ret);
+
+out_nowrite:
+	btrfs_release_path(root, path);
+	iput(dir);
+	iput(inode);
+	return 0;
+}
+
+/*
+ * replay one csum item from the log tree into the subvolume 'root'
+ * eb, slot and key all refer to the log tree
+ * path is for temp use by this function and should be released on return
+ *
+ * This copies the checksums out of the log tree and inserts them into
+ * the subvolume.  Any existing checksums for this range in the file
+ * are overwritten, and new items are added where required.
+ *
+ * We keep this simple by reusing the btrfs_ordered_sum code from
+ * the data=ordered mode.  This basically means making a copy
+ * of all the checksums in ram, which we have to do anyway for kmap
+ * rules.
+ *
+ * The copy is then sent down to btrfs_csum_file_blocks, which
+ * does all the hard work of finding existing items in the file
+ * or adding new ones.
+ *
+ * Returns 0 on success, -EIO if the inode cannot be read and -ENOMEM
+ * if the copy of the checksums cannot be allocated.
+ */
+static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct extent_buffer *eb, int slot,
+				      struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u64 cur_offset;
+	unsigned long file_bytes;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct inode *inode;
+	unsigned long ptr;
+
+	/* one checksum per sector of file data */
+	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
+	inode = read_one_inode(root, key->objectid);
+	if (!inode) {
+		return -EIO;
+	}
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
+	if (!sums) {
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&sums->list);
+	sums->len = file_bytes;
+	sums->file_offset = key->offset;
+
+	/*
+	 * copy all the sums into the ordered sum struct
+	 *
+	 * Only whole checksums are copied.  file_bytes above counts whole
+	 * checksums only, and item_size is unsigned, so stepping past a
+	 * trailing partial checksum would wrap around instead of ending
+	 * the loop.
+	 */
+	sector_sum = sums->sums;
+	cur_offset = key->offset;
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	while (item_size >= BTRFS_CRC32_SIZE) {
+		sector_sum->offset = cur_offset;
+		read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
+		sector_sum++;
+		item_size -= BTRFS_CRC32_SIZE;
+		ptr += BTRFS_CRC32_SIZE;
+		cur_offset += root->sectorsize;
+	}
+
+	/* let btrfs_csum_file_blocks add them into the file */
+	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
+	BUG_ON(ret);
+	kfree(sums);
+	iput(inode);
+
+	return 0;
+}
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ *
+ * Returns 0, or -ENOMEM if no path can be allocated for the scan.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct inode *inode)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	u64 nlink = 0;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	int name_len;
+
+	/* start past the last possible ref item and walk backwards */
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			/* no exact match, step back to the previous item */
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &key,
+				      path->slots[0]);
+		if (key.objectid != inode->i_ino ||
+		    key.type != BTRFS_INODE_REF_KEY)
+			break;
+		/* every ref packed into this item counts as one link */
+		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+						   path->slots[0]);
+		while(ptr < ptr_end) {
+			struct btrfs_inode_ref *ref;
+
+			ref = (struct btrfs_inode_ref *)ptr;
+			name_len = btrfs_inode_ref_name_len(path->nodes[0],
+							    ref);
+			ptr = (unsigned long)(ref + 1) + name_len;
+			nlink++;
+		}
+
+		if (key.offset == 0)
+			break;
+		key.offset--;
+		btrfs_release_path(root, path);
+	}
+	btrfs_free_path(path);
+	if (nlink != inode->i_nlink) {
+		inode->i_nlink = nlink;
+		btrfs_update_inode(trans, root, inode);
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the BTRFS_TREE_LOG_FIXUP_OBJECTID orphan items of 'root' from
+ * the highest offset down.  Each item is deleted and the inode named by
+ * its offset is run through fixup_inode_link_count.
+ *
+ * path is scratch space and is released on return.  Always returns 0;
+ * failures inside the loop hit BUG_ON().
+ */
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    struct btrfs_path *path)
+{
+	struct btrfs_key cursor;
+	struct inode *victim;
+	int err;
+
+	cursor.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	cursor.type = BTRFS_ORPHAN_ITEM_KEY;
+	cursor.offset = (u64)-1;
+
+	for (;;) {
+		err = btrfs_search_slot(trans, root, &cursor, path, -1, 1);
+		if (err < 0)
+			break;
+
+		/* no exact hit: use the item just before the slot, if any */
+		if (err == 1) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		btrfs_item_key_to_cpu(path->nodes[0], &cursor, path->slots[0]);
+		if (!(cursor.objectid == BTRFS_TREE_LOG_FIXUP_OBJECTID &&
+		      cursor.type == BTRFS_ORPHAN_ITEM_KEY))
+			break;
+
+		/* the fixup record goes away before the inode is checked */
+		err = btrfs_del_item(trans, root, path);
+		BUG_ON(err);
+		btrfs_release_path(root, path);
+
+		victim = read_one_inode(root, cursor.offset);
+		BUG_ON(!victim);
+
+		err = fixup_inode_link_count(trans, root, victim);
+		BUG_ON(err);
+
+		iput(victim);
+
+		if (cursor.offset == 0)
+			break;
+		cursor.offset--;
+	}
+
+	btrfs_release_path(root, path);
+	return 0;
+}
+
+
+/*
+ * remember 'objectid' in the fixup dir so its link count gets checked
+ * once replay is finished.  The first time an inode is recorded its link
+ * count is bumped, which keeps it from going away before that check.
+ * An inode that is already recorded is left alone.
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      u64 objectid)
+{
+	struct inode *inode;
+	struct btrfs_key fixup_key;
+	int err;
+
+	inode = read_one_inode(root, objectid);
+	BUG_ON(!inode);
+
+	fixup_key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	btrfs_set_key_type(&fixup_key, BTRFS_ORPHAN_ITEM_KEY);
+	fixup_key.offset = objectid;
+
+	err = btrfs_insert_empty_item(trans, root, path, &fixup_key, 0);
+	btrfs_release_path(root, path);
+
+	switch (err) {
+	case 0:
+		/* new fixup item, pin the inode with an extra link */
+		btrfs_inc_nlink(inode);
+		btrfs_update_inode(trans, root, inode);
+		break;
+	case -EEXIST:
+		/* already queued for fixup, nothing more to do */
+		err = 0;
+		break;
+	default:
+		BUG();
+	}
+
+	iput(inode);
+	return err;
+}
+
+/*
+ * directory log replay only adds names whose inode is really there, so
+ * fsyncing a directory does not implicitly fsync the new files inside it.
+ *
+ * returns -ENOENT when the inode at 'location' can't be read, -EIO when
+ * the directory can't be read, otherwise whatever btrfs_add_link returns.
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    u64 dirid, u64 index,
+				    char *name, int name_len, u8 type,
+				    struct btrfs_key *location)
+{
+	struct inode *target;
+	struct inode *parent;
+	int err;
+
+	target = read_one_inode(root, location->objectid);
+	if (!target)
+		return -ENOENT;
+
+	parent = read_one_inode(root, dirid);
+	if (parent) {
+		err = btrfs_add_link(trans, parent, target, name, name_len,
+				     1, index);
+
+		/* FIXME, put inode into FIXUP list */
+	} else {
+		err = -EIO;
+	}
+
+	iput(target);
+	if (parent)
+		iput(parent);
+	return err;
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped.  fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ *
+ * returns 0 on success or -ENOMEM if the name buffer can't be
+ * allocated.  Other failures trip BUG/BUG_ON.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *eb,
+				    struct btrfs_dir_item *di,
+				    struct btrfs_key *key)
+{
+	char *name;
+	int name_len;
+	struct btrfs_dir_item *dst_di = NULL;
+	struct btrfs_key found_key;
+	struct btrfs_key log_key;
+	struct inode *dir;
+	struct inode *inode;
+	u8 log_type;
+	int ret;
+
+	dir = read_one_inode(root, key->objectid);
+	BUG_ON(!dir);
+
+	name_len = btrfs_dir_name_len(eb, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name) {
+		/* nothing but the dir inode is held at this point */
+		iput(dir);
+		return -ENOMEM;
+	}
+	log_type = btrfs_dir_type(eb, di);
+	read_extent_buffer(eb, name, (unsigned long)(di + 1),
+		   name_len);
+
+	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+	if (key->type == BTRFS_DIR_ITEM_KEY) {
+		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+				       name, name_len, 1);
+	}
+	else if (key->type == BTRFS_DIR_INDEX_KEY) {
+		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+						     key->objectid,
+						     key->offset, name,
+						     name_len, 1);
+	} else {
+		BUG();
+	}
+	if (!dst_di || IS_ERR(dst_di)) {
+		/* we need a sequence number to insert, so we only
+		 * do inserts for the BTRFS_DIR_INDEX_KEY types
+		 */
+		if (key->type != BTRFS_DIR_INDEX_KEY)
+			goto out;
+		goto insert;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+	/* the existing item matches the logged item */
+	if (found_key.objectid == log_key.objectid &&
+	    found_key.type == log_key.type &&
+	    found_key.offset == log_key.offset &&
+	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+		goto out;
+	}
+
+	/*
+	 * don't drop the conflicting directory entry if the inode
+	 * for the new entry doesn't exist
+	 */
+	inode = read_one_inode(root, log_key.objectid);
+	if (!inode)
+		goto out;
+
+	iput(inode);
+	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+	BUG_ON(ret);
+
+	if (key->type == BTRFS_DIR_INDEX_KEY)
+		goto insert;
+out:
+	btrfs_release_path(root, path);
+	kfree(name);
+	iput(dir);
+	return 0;
+
+insert:
+	btrfs_release_path(root, path);
+	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+			      name, name_len, log_type, &log_key);
+
+	/* -ENOENT means the inode isn't there, the name is just skipped */
+	if (ret && ret != -ENOENT)
+		BUG();
+	goto out;
+}
+
+/*
+ * walk all the names packed into one directory item from the log and
+ * replay each of them into the subvolume.  Only BTRFS_DIR_ITEM_KEY items
+ * will hold more than one name, but both directory index types share
+ * this code.
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct extent_buffer *eb, int slot,
+					struct btrfs_key *key)
+{
+	unsigned long cur = btrfs_item_ptr_offset(eb, slot);
+	unsigned long end = cur + btrfs_item_size_nr(eb, slot);
+	struct btrfs_dir_item *di;
+	int name_len;
+	int ret;
+
+	/* each entry is a dir item header followed directly by its name */
+	for (; cur < end; cur = (unsigned long)(di + 1) + name_len) {
+		di = (struct btrfs_dir_item *)cur;
+		name_len = btrfs_dir_name_len(eb, di);
+
+		ret = replay_one_name(trans, root, path, eb, di, key);
+		BUG_ON(ret);
+	}
+	return 0;
+}
+
+/*
+ * directory replay has two parts.  There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for.  During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ *
+ * find_dir_range looks in 'root' for the range item of type 'key_type'
+ * that covers *start_ret for directory 'dirid'.  If no item covers it,
+ * the next range item for the directory is used instead.
+ *
+ * returns 0 with *start_ret and *end_ret set to the range found,
+ * 1 if there are no more ranges, and < 0 on error.  The path is
+ * always released before returning.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   u64 dirid, int key_type,
+				   u64 *start_ret, u64 *end_ret)
+{
+	struct btrfs_key key;
+	u64 found_end;
+	struct btrfs_dir_log_item *item;
+	int ret;
+	int nritems;
+
+	if (*start_ret == (u64)-1)
+		return 1;
+
+	key.objectid = dirid;
+	key.type = key_type;
+	key.offset = *start_ret;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+	if (ret != 0)
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto next;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+	if (*start_ret >= key.offset && *start_ret <= found_end) {
+		ret = 0;
+		*start_ret = key.offset;
+		*end_ret = found_end;
+		goto out;
+	}
+	ret = 1;
+next:
+	/*
+	 * check the next slot in the tree to see if it is a valid item.
+	 * The slot has to be advanced before it is compared with nritems,
+	 * otherwise the last item of a leaf sends us reading one slot
+	 * past the end instead of moving on to the next leaf.
+	 */
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	path->slots[0]++;
+	if (path->slots[0] >= nritems) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			goto out;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto out;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+	*start_ret = key.offset;
+	*end_ret = found_end;
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * this looks for a given directory item in the log.  If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ *
+ * 'path' has to point at the subvolume item for 'dir_key' on entry.
+ * Every name packed into that item is looked up in 'log', by name for
+ * BTRFS_DIR_ITEM_KEY and by index (dir_key->offset) plus name for
+ * BTRFS_DIR_INDEX_KEY.
+ *
+ * When a name is missing from the log, both paths are released, the
+ * inode is recorded in the fixup dir, its link count is bumped and the
+ * name is unlinked from 'dir'.  The subvolume is then searched for
+ * 'dir_key' again and the scan restarts if the item is still there.
+ *
+ * returns 0, or -ENOMEM if a name buffer can't be allocated.  Both
+ * paths are released before returning.
+ *
+ * NOTE(review): an error pointer from the log lookup is handled the same
+ * way as a missing name, so the entry gets unlinked.  Confirm the lookup
+ * helpers can only fail with "not found" here.
+ * NOTE(review): a negative return from the re-search is discarded.
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_root *log,
+				      struct btrfs_path *path,
+				      struct btrfs_path *log_path,
+				      struct inode *dir,
+				      struct btrfs_key *dir_key)
+{
+	int ret;
+	struct extent_buffer *eb;
+	int slot;
+	u32 item_size;
+	struct btrfs_dir_item *di;
+	struct btrfs_dir_item *log_di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	char *name;
+	struct inode *inode;
+	struct btrfs_key location;
+
+again:
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while(ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		name_len = btrfs_dir_name_len(eb, di);
+		name = kmalloc(name_len, GFP_NOFS);
+		if (!name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		read_extent_buffer(eb, name, (unsigned long)(di + 1),
+				  name_len);
+		log_di = NULL;
+		if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+			log_di = btrfs_lookup_dir_item(trans, log, log_path,
+						       dir_key->objectid,
+						       name, name_len, 0);
+		} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+			log_di = btrfs_lookup_dir_index_item(trans, log,
+						     log_path,
+						     dir_key->objectid,
+						     dir_key->offset,
+						     name, name_len, 0);
+		}
+		if (!log_di || IS_ERR(log_di)) {	/* not in the log (or the lookup failed) */
+			btrfs_dir_item_key_to_cpu(eb, di, &location);	/* read the target before eb is released */
+			btrfs_release_path(root, path);
+			btrfs_release_path(log, log_path);
+			inode = read_one_inode(root, location.objectid);
+			BUG_ON(!inode);
+
+			ret = link_to_fixup_dir(trans, root,
+						path, location.objectid);
+			BUG_ON(ret);
+			btrfs_inc_nlink(inode);
+			ret = btrfs_unlink_inode(trans, root, dir, inode,
+						 name, name_len);
+			BUG_ON(ret);
+			kfree(name);
+			iput(inode);
+
+			/* there might still be more names under this key
+			 * check and repeat if required
+			 */
+			ret = btrfs_search_slot(NULL, root, dir_key, path,
+						0, 0);
+			if (ret == 0)
+				goto again;
+			ret = 0;	/* any non-zero search result ends the scan */
+			goto out;
+		}
+		btrfs_release_path(log, log_path);
+		kfree(name);
+
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, log_path);
+	return ret;
+}
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes.  It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ *
+ * The scan is done twice for directory 'dirid': first the
+ * BTRFS_DIR_LOG_ITEM_KEY ranges are checked against BTRFS_DIR_ITEM_KEY
+ * items, then the BTRFS_DIR_LOG_INDEX_KEY ranges against
+ * BTRFS_DIR_INDEX_KEY items.
+ *
+ * returns 0 on success (also when the directory inode can't be read),
+ * -ENOMEM if the log path can't be allocated, or the negative error
+ * from searching the subvolume.  A non-zero return from find_dir_range
+ * only ends the current pass, it is not passed back to the caller.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid)
+{
+	u64 range_start;
+	u64 range_end;
+	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+	int ret = 0;
+	struct btrfs_key dir_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *log_path;
+	struct inode *dir;
+
+	dir_key.objectid = dirid;
+	dir_key.type = BTRFS_DIR_ITEM_KEY;
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	dir = read_one_inode(root, dirid);
+	/* it isn't an error if the inode isn't there, that can happen
+	 * because we replay the deletes before we copy in the inode item
+	 * from the log
+	 */
+	if (!dir) {
+		btrfs_free_path(log_path);
+		return 0;
+	}
+again:
+	range_start = 0;
+	range_end = 0;
+	while(1) {
+		ret = find_dir_range(log, path, dirid, key_type,
+				     &range_start, &range_end);
+		if (ret != 0)
+			break;
+
+		dir_key.offset = range_start;
+		while(1) {
+			int nritems;
+			ret = btrfs_search_slot(NULL, root, &dir_key, path,
+						0, 0);
+			if (ret < 0)
+				goto out;
+
+			nritems = btrfs_header_nritems(path->nodes[0]);
+			if (path->slots[0] >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+			if (found_key.objectid != dirid ||
+			    found_key.type != dir_key.type)
+				goto next_type;
+
+			if (found_key.offset > range_end)
+				break;
+
+			ret = check_item_in_log(trans, root, log, path,
+						log_path, dir, &found_key);
+			BUG_ON(ret);
+			if (found_key.offset == (u64)-1)
+				break;
+			dir_key.offset = found_key.offset + 1;	/* resume the search right after this key */
+		}
+		btrfs_release_path(root, path);
+		if (range_end == (u64)-1)
+			break;
+		range_start = range_end + 1;	/* look for the next logged range */
+	}
+
+next_type:
+	ret = 0;
+	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+		key_type = BTRFS_DIR_LOG_INDEX_KEY;	/* second pass covers the index keys */
+		dir_key.type = BTRFS_DIR_INDEX_KEY;
+		btrfs_release_path(root, path);
+		goto again;
+	}
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(log_path);
+	iput(dir);
+	return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree.  This
+ * gets called in two different stages.  The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume.  The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ *
+ * 'eb' is a block of the log tree 'log' and items are replayed into
+ * wc->replay_dest.  Blocks that are not leaves are read and then
+ * ignored.
+ *
+ * For an inode item in the LOG_WALK_REPLAY_INODES stage: directories get
+ * their deletes replayed first, the inode item is copied over, regular
+ * files have their extent items past i_size truncated away, and the
+ * inode is recorded in the fixup dir.
+ *
+ * always returns 0, every failure trips a BUG_ON (add_inode_ref may
+ * return -ENOENT without doing so).  The result of btrfs_read_buffer is
+ * not checked.
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+			     struct walk_control *wc, u64 gen)
+{
+	int nritems;
+	struct btrfs_path *path;
+	struct btrfs_root *root = wc->replay_dest;
+	struct btrfs_key key;
+	u32 item_size;
+	int level;
+	int i;
+	int ret;
+
+	btrfs_read_buffer(eb, gen);
+
+	level = btrfs_header_level(eb);
+
+	if (level != 0)
+		return 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = 0; i < nritems; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+		item_size = btrfs_item_size_nr(eb, i);	/* value is not used below */
+
+		/* inode keys are done during the first stage */
+		if (key.type == BTRFS_INODE_ITEM_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_INODES) {
+			struct inode *inode;
+			struct btrfs_inode_item *inode_item;
+			u32 mode;
+
+			inode_item = btrfs_item_ptr(eb, i,
+					    struct btrfs_inode_item);
+			mode = btrfs_inode_mode(eb, inode_item);
+			if (S_ISDIR(mode)) {
+				ret = replay_dir_deletes(wc->trans,
+					 root, log, path, key.objectid);
+				BUG_ON(ret);
+			}
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+
+			/* for regular files, truncate away
+			 * extents past the new EOF
+			 */
+			if (S_ISREG(mode)) {
+				inode = read_one_inode(root,
+						       key.objectid);
+				BUG_ON(!inode);
+
+				ret = btrfs_truncate_inode_items(wc->trans,
+					root, inode, inode->i_size,
+					BTRFS_EXTENT_DATA_KEY);
+				BUG_ON(ret);
+				iput(inode);
+			}
+			ret = link_to_fixup_dir(wc->trans, root,
+						path, key.objectid);
+			BUG_ON(ret);
+		}
+		if (wc->stage < LOG_WALK_REPLAY_ALL)	/* earlier stages stop here for every item */
+			continue;
+
+		/* these keys are simply copied */
+		if (key.type == BTRFS_XATTR_ITEM_KEY) {
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_INODE_REF_KEY) {
+			ret = add_inode_ref(wc->trans, root, log, path,
+					    eb, i, &key);
+			BUG_ON(ret && ret != -ENOENT);
+		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+			ret = replay_one_extent(wc->trans, root, path,
+						eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
+			ret = replay_one_csum(wc->trans, root, path,
+					      eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
+			   key.type == BTRFS_DIR_INDEX_KEY) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			BUG_ON(ret);
+		}
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * walk down the log tree from path->nodes[*level], following
+ * path->slots.  Every child block is handed to wc->process_func with
+ * the generation stored in its parent pointer.
+ *
+ * At level 1 the children are leaves: the slot is advanced and, when
+ * wc->free is set, the leaf is cleaned, waited on and its extent is
+ * freed.  At higher levels the child is read and installed in the path
+ * one level down.
+ *
+ * Once the current node has no slots left (or *level is 0), the node at
+ * *level itself is processed, optionally freed, dropped from the path
+ * and *level is moved up by one.
+ *
+ * always returns 0, failures while freeing trip BUG_ON.
+ */
+static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level,
+				   struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 root_gen;
+	u64 bytenr;
+	u64 ptr_gen;
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
+	struct extent_buffer *parent;
+	u32 blocksize;
+	int ret = 0;
+
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	while(*level > 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
+		cur = path->nodes[*level];
+
+		WARN_ON(btrfs_header_level(cur) != *level);
+
+		if (path->slots[*level] >=
+		    btrfs_header_nritems(cur))
+			break;
+
+		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+		blocksize = btrfs_level_size(root, *level - 1);
+
+		parent = path->nodes[*level];
+		root_owner = btrfs_header_owner(parent);
+		root_gen = btrfs_header_generation(parent);
+
+		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+
+		wc->process_func(root, next, wc, ptr_gen);
+
+		if (*level == 1) {
+			/* next is a leaf, finish with it and stay at level 1 */
+			path->slots[*level]++;
+			if (wc->free) {
+				btrfs_read_buffer(next, ptr_gen);
+
+				btrfs_tree_lock(next);
+				clean_tree_block(trans, root, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+
+				ret = btrfs_drop_leaf_ref(trans, root, next);
+				BUG_ON(ret);
+
+				WARN_ON(root_owner !=
+					BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_extent(trans, root, bytenr,
+							blocksize, root_owner,
+							root_gen, 0, 0, 1);
+				BUG_ON(ret);
+			}
+			free_extent_buffer(next);
+			continue;
+		}
+		btrfs_read_buffer(next, ptr_gen);
+
+		/* next is a node, make it the current block one level down */
+		WARN_ON(*level <= 0);
+		if (path->nodes[*level-1])
+			free_extent_buffer(path->nodes[*level-1]);
+		path->nodes[*level-1] = next;
+		*level = btrfs_header_level(next);
+		path->slots[*level] = 0;
+		cond_resched();
+	}
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	/* the owner and generation come from the parent, or the root itself */
+	if (path->nodes[*level] == root->node) {
+		parent = path->nodes[*level];
+	} else {
+		parent = path->nodes[*level + 1];
+	}
+	bytenr = path->nodes[*level]->start;
+
+	blocksize = btrfs_level_size(root, *level);
+	root_owner = btrfs_header_owner(parent);
+	root_gen = btrfs_header_generation(parent);
+
+	wc->process_func(root, path->nodes[*level], wc,
+			 btrfs_header_generation(path->nodes[*level]));
+
+	if (wc->free) {
+		next = path->nodes[*level];
+		btrfs_tree_lock(next);
+		clean_tree_block(trans, root, next);
+		btrfs_wait_tree_block_writeback(next);
+		btrfs_tree_unlock(next);
+
+		if (*level == 0) {
+			ret = btrfs_drop_leaf_ref(trans, root, next);
+			BUG_ON(ret);
+		}
+		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+					  root_owner, root_gen, 0, 0, 1);
+		BUG_ON(ret);
+	}
+	free_extent_buffer(path->nodes[*level]);
+	path->nodes[*level] = NULL;
+	*level += 1;
+
+	cond_resched();
+	return 0;
+}
+
+/*
+ * walk back up the path starting at *level, looking for a node that
+ * still has slots left to visit.
+ *
+ * If one is found its slot is advanced, *level is set to that level and
+ * 0 is returned so the caller can walk down again.  Every node that is
+ * exhausted on the way up is handed to wc->process_func, freed when
+ * wc->free is set, and dropped from the path.
+ *
+ * returns 1 when the walk has run out of nodes, which ends the tree
+ * walk.
+ */
+static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, int *level,
+				 struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 root_gen;
+	int i;
+	int slot;
+	int ret;
+
+	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+			path->slots[i]++;
+			*level = i;
+			WARN_ON(*level == 0);
+			return 0;
+		} else {
+			/* *level is equal to i whenever this branch runs */
+			if (path->nodes[*level] == root->node) {
+				root_owner = root->root_key.objectid;
+				root_gen =
+				   btrfs_header_generation(path->nodes[*level]);
+			} else {
+				struct extent_buffer *node;
+				node = path->nodes[*level + 1];
+				root_owner = btrfs_header_owner(node);
+				root_gen = btrfs_header_generation(node);
+			}
+			wc->process_func(root, path->nodes[*level], wc,
+				 btrfs_header_generation(path->nodes[*level]));
+			if (wc->free) {
+				struct extent_buffer *next;
+
+				next = path->nodes[*level];
+
+				btrfs_tree_lock(next);
+				clean_tree_block(trans, root, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+
+				if (*level == 0) {
+					ret = btrfs_drop_leaf_ref(trans, root,
+								  next);
+					BUG_ON(ret);
+				}
+
+				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_extent(trans, root,
+						path->nodes[*level]->start,
+						path->nodes[*level]->len,
+						root_owner, root_gen, 0, 0, 1);
+				BUG_ON(ret);
+			}
+			free_extent_buffer(path->nodes[*level]);
+			path->nodes[*level] = NULL;
+			*level = i + 1;
+		}
+	}
+	return 1;
+}
+
+/*
+ * walk all the blocks of the log tree 'log', handing each one to
+ * wc->process_func.  The walk alternates between walk_down_log_tree and
+ * walk_up_log_tree until walk_up_log_tree reports there is nothing left.
+ *
+ * If the root block is still in the path after the walk, it is
+ * processed here.  When wc->free is set, each block is also cleaned,
+ * waited on and its extent is freed, and the reference on log->node is
+ * dropped at the end.
+ *
+ * NOTE(review): 'ret' is reused for the drop/free calls on the root
+ * block, so an error recorded during the walk is overwritten there.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *log, struct walk_control *wc)
+{
+	int ret = 0;
+	int wret;
+	int level;
+	struct btrfs_path *path;
+	int i;
+	int orig_level;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	level = btrfs_header_level(log->node);
+	orig_level = level;
+	path->nodes[level] = log->node;
+	extent_buffer_get(log->node);	/* extra reference for the path's copy of the root */
+	path->slots[level] = 0;
+
+	while(1) {
+		wret = walk_down_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+
+		wret = walk_up_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+	}
+
+	/* was the root node processed? if not, catch it here */
+	if (path->nodes[orig_level]) {
+		wc->process_func(log, path->nodes[orig_level], wc,
+			 btrfs_header_generation(path->nodes[orig_level]));
+		if (wc->free) {
+			struct extent_buffer *next;
+
+			next = path->nodes[orig_level];
+
+			btrfs_tree_lock(next);
+			clean_tree_block(trans, log, next);
+			btrfs_wait_tree_block_writeback(next);
+			btrfs_tree_unlock(next);
+
+			if (orig_level == 0) {
+				ret = btrfs_drop_leaf_ref(trans, log,
+							  next);
+				BUG_ON(ret);
+			}
+			WARN_ON(log->root_key.objectid !=
+				BTRFS_TREE_LOG_OBJECTID);
+			ret = btrfs_free_extent(trans, log,
+						next->start, next->len,
+						log->root_key.objectid,
+						btrfs_header_generation(next),
+						0, 0, 1);
+			BUG_ON(ret);
+		}
+	}
+
+	for (i = 0; i <= orig_level; i++) {	/* drop whatever the path still holds */
+		if (path->nodes[i]) {
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
+		}
+	}
+	btrfs_free_path(path);
+	if (wc->free)
+		free_extent_buffer(log->node);
+	return ret;
+}
+
+/*
+ * wait for a log commit that is already running to finish.
+ *
+ * called with tree_log_mutex held.  The mutex is dropped while
+ * sleeping and is held again on return.  The wait ends once the log
+ * transid has moved on or no commit is marked as running.
+ */
+int wait_log_commit(struct btrfs_root *log)
+{
+	u64 start_transid = log->fs_info->tree_log_transid;
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&log->fs_info->tree_log_mutex);
+
+		/* only sleep if the commit is still in flight */
+		if (atomic_read(&log->fs_info->tree_log_commit))
+			schedule();
+
+		finish_wait(&log->fs_info->tree_log_wait, &wait);
+		mutex_lock(&log->fs_info->tree_log_mutex);
+
+		if (start_transid != log->fs_info->tree_log_transid)
+			break;
+		if (!atomic_read(&log->fs_info->tree_log_commit))
+			break;
+	}
+	return 0;
+}
+
+/*
+ * btrfs_sync_log sends a given tree log down to the disk and
+ * updates the super blocks to record it.  When this call is done,
+ * you know that any inodes previously logged are safely on disk
+ *
+ * If another log commit is already running, this just waits for it
+ * and returns.  Otherwise tree_log_commit is set and the function waits
+ * until there are no log writers left and tree_log_batch has stopped
+ * changing between rounds.
+ *
+ * The log tree and the tree of log roots are each walked twice with
+ * process_one_buffer, the second time with wc.wait set.  The super block
+ * copy is then pointed at the log root tree and written, the log
+ * transid is bumped, the batch count is reset, the commit flag is
+ * cleared and waiters are woken.
+ *
+ * always returns 0, failed tree walks trip BUG_ON.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root)
+{
+	int ret;
+	unsigned long batch;
+	struct btrfs_root *log = root->log_root;
+	struct walk_control wc = {
+		.write = 1,
+		.process_func = process_one_buffer
+	};
+
+	mutex_lock(&log->fs_info->tree_log_mutex);
+	if (atomic_read(&log->fs_info->tree_log_commit)) {
+		wait_log_commit(log);	/* someone else is committing, ride along */
+		goto out;
+	}
+	atomic_set(&log->fs_info->tree_log_commit, 1);
+
+	while(1) {
+		mutex_unlock(&log->fs_info->tree_log_mutex);
+		schedule_timeout_uninterruptible(1);
+		mutex_lock(&log->fs_info->tree_log_mutex);
+		batch = log->fs_info->tree_log_batch;
+
+		while(atomic_read(&log->fs_info->tree_log_writers)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			batch = log->fs_info->tree_log_batch;
+			mutex_unlock(&log->fs_info->tree_log_mutex);
+			if (atomic_read(&log->fs_info->tree_log_writers))
+				schedule();
+			mutex_lock(&log->fs_info->tree_log_mutex);
+			finish_wait(&log->fs_info->tree_log_wait, &wait);
+		}
+		if (batch == log->fs_info->tree_log_batch)	/* nothing new was logged while we waited */
+			break;
+	}
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	BUG_ON(ret);
+
+	wc.wait = 1;	/* second pass over both trees, this time with wait set */
+
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	BUG_ON(ret);
+
+	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+				 log->fs_info->log_root_tree->node->start);
+	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+		       btrfs_header_level(log->fs_info->log_root_tree->node));
+
+	write_ctree_super(trans, log->fs_info->tree_root);
+	log->fs_info->tree_log_transid++;
+	log->fs_info->tree_log_batch = 0;
+	atomic_set(&log->fs_info->tree_log_commit, 0);
+	smp_mb();
+	if (waitqueue_active(&log->fs_info->tree_log_wait))
+		wake_up(&log->fs_info->tree_log_wait);
+out:
+	mutex_unlock(&log->fs_info->tree_log_mutex);
+	return 0;
+
+}
+
+/*
+ * free all the extents used by the tree log.  This should be called
+ * at commit time of the full transaction
+ *
+ * Every block of the log tree is freed by walking it with wc.free set,
+ * the log's root item is deleted from the tree of log roots and the
+ * log root struct itself is freed.
+ *
+ * returns 0, including when the subvolume has no log tree.  Failures
+ * trip BUG_ON.
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+	int ret;
+	struct btrfs_root *log = root->log_root;
+	struct walk_control wc = {
+		.free = 1,
+		.process_func = process_one_buffer
+	};
+
+	if (!log)
+		return 0;
+
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+			     &log->root_key);
+	BUG_ON(ret);
+
+	/*
+	 * clear the pointer in the subvolume first, but free the struct
+	 * through our local copy so it doesn't turn into kfree(NULL)
+	 */
+	root->log_root = NULL;
+	kfree(log);
+	return 0;
+}
+
+/*
+ * keep the root item for a subvolume's log in the tree of log roots up
+ * to date.  The item is only rewritten when the log's root block has
+ * moved since the root item was last set.
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	int ret = 0;
+
+	if (btrfs_root_bytenr(&log->root_item) != log->node->start) {
+		btrfs_set_root_bytenr(&log->root_item, log->node->start);
+		btrfs_set_root_level(&log->root_item,
+				     btrfs_header_level(log->node));
+
+		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+					&log->root_key, &log->root_item);
+		BUG_ON(ret);
+	}
+	return ret;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist.  But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked.  Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimization allows us to avoid relogging the entire inode
+ * or the entire directory.
+ *
+ * returns 0 when there is no running log transaction to join or once
+ * the entries have been dealt with, and -ENOMEM if no path could be
+ * allocated.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 const char *name, int name_len,
+				 struct inode *dir, u64 index)
+{
+	struct btrfs_root *log;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	int ret;
+	int err = 0;
+	int bytes_del = 0;
+
+	ret = join_running_log_trans(root);
+	if (ret)
+		return 0;
+
+	mutex_lock(&BTRFS_I(dir)->log_mutex);
+
+	log = root->log_root;
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	/* drop the name from the dir item in the log, if it is there */
+	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+				   name, name_len, -1);
+	if (di && !IS_ERR(di)) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		BUG_ON(ret);
+	}
+	btrfs_release_path(log, path);
+
+	/* and the same for the dir index item */
+	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+					 index, name, name_len, -1);
+	if (di && !IS_ERR(di)) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		BUG_ON(ret);
+	}
+
+	/* update the directory size in the log to reflect the names
+	 * we have removed
+	 */
+	if (bytes_del) {
+		struct btrfs_key key;
+
+		key.objectid = dir->i_ino;
+		key.offset = 0;
+		key.type = BTRFS_INODE_ITEM_KEY;
+		btrfs_release_path(log, path);
+
+		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+		if (ret == 0) {
+			struct btrfs_inode_item *item;
+			u64 i_size;
+
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_inode_item);
+			i_size = btrfs_inode_size(path->nodes[0], item);
+			/* never let the size wrap below zero */
+			if (i_size > bytes_del)
+				i_size -= bytes_del;
+			else
+				i_size = 0;
+			btrfs_set_inode_size(path->nodes[0], item, i_size);
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+		}
+		btrfs_release_path(log, path);
+	}
+
+	btrfs_free_path(path);
+out_unlock:
+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
+	end_log_trans(root);
+
+	return err;
+}
+
+/*
+ * remove the back reference 'name' in directory 'dirid' from the copy of
+ * 'inode' in the log.  See comments for btrfs_del_dir_entries_in_log.
+ *
+ * returns 0 if there is no running log transaction, or if the ref was
+ * removed or wasn't in the log at all.  Other errors from
+ * btrfs_del_inode_ref are passed back.
+ */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       const char *name, int name_len,
+			       struct inode *inode, u64 dirid)
+{
+	struct btrfs_root *log;
+	u64 index;
+	int err;
+
+	if (join_running_log_trans(root))
+		return 0;
+
+	log = root->log_root;
+
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+	err = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+				  dirid, &index);
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	end_log_trans(root);
+
+	/* a ref that was never logged is not an error */
+	return (err == -ENOENT) ? 0 : err;
+}
+
+/*
+ * add a range item to the log for directory 'dirid'.  The item is keyed
+ * on first_offset and stores last_offset, and together they give the
+ * part of the key space the log should be considered authoritative for.
+ * BTRFS_DIR_ITEM_KEY ranges are stored as BTRFS_DIR_LOG_ITEM_KEY, any
+ * other type as BTRFS_DIR_LOG_INDEX_KEY.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       int key_type, u64 dirid,
+				       u64 first_offset, u64 last_offset)
+{
+	struct btrfs_dir_log_item *range;
+	struct btrfs_key range_key;
+	int err;
+
+	range_key.objectid = dirid;
+	range_key.type = (key_type == BTRFS_DIR_ITEM_KEY) ?
+			 BTRFS_DIR_LOG_ITEM_KEY : BTRFS_DIR_LOG_INDEX_KEY;
+	range_key.offset = first_offset;
+
+	err = btrfs_insert_empty_item(trans, log, path, &range_key,
+				      sizeof(*range));
+	BUG_ON(err);
+
+	/* the end of the range lives in the item body */
+	range = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			       struct btrfs_dir_log_item);
+	btrfs_set_dir_log_end(path->nodes[0], range, last_offset);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	btrfs_release_path(log, path);
+	return 0;
+}
+
+/*
+ * log the items of 'key_type' changed in the current transaction for a given
+ * directory, starting at min_offset.  This also creates the range item in the
+ * log tree required to replay deletes; *last_offset_ret is where it ended.
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path, int key_type,
+			  u64 min_offset, u64 *last_offset_ret)
+{
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	struct extent_buffer *src;
+	int ret;
+	int i;
+	int nritems;
+	u64 first_offset = min_offset;
+	u64 last_offset = (u64)-1;
+
+	log = root->log_root;	/* (repeats the initializer above) */
+	max_key.objectid = inode->i_ino;
+	max_key.offset = (u64)-1;
+	max_key.type = key_type;
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = key_type;
+	min_key.offset = min_offset;
+
+	path->keep_locks = 1;
+
+	ret = btrfs_search_forward(root, &min_key, &max_key,
+				   path, 0, trans->transid);
+
+	/*
+	 * nothing of this type was changed in this transaction (or the
+	 * search failed), see if there is anything of this type at all
+	 */
+	if (ret != 0 || min_key.objectid != inode->i_ino ||
+	    min_key.type != key_type) {
+		min_key.objectid = inode->i_ino;
+		min_key.type = key_type;
+		min_key.offset = (u64)-1;
+		btrfs_release_path(root, path);
+		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_release_path(root, path);
+			return ret;
+		}
+		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+
+		/* if ret == 0 there are items for this type, and the
+		 * range starts just past max(min_offset, last key offset).
+		 * otherwise, there are no items in this directory after
+		 * min_offset, and we create a range to indicate that.
+		 */
+		if (ret == 0) {
+			struct btrfs_key tmp;
+			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+					      path->slots[0]);
+			if (key_type == tmp.type) {
+				first_offset = max(min_offset, tmp.offset) + 1;
+			}
+		}
+		goto done;
+	}
+
+	/* go backward to find any previous key; it starts the range */
+	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+	if (ret == 0) {
+		struct btrfs_key tmp;
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (key_type == tmp.type) {
+			first_offset = tmp.offset;
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+		}	/* NOTE(review): this overwrite_item ret is unchecked */
+	}
+	btrfs_release_path(root, path);
+
+	/* find the first key from this transaction again */
+	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+	if (ret != 0) {
+		WARN_ON(1);
+		goto done;
+	}
+
+	/*
+	 * we have a block from this transaction, log every item in it
+	 * from our directory
+	 */
+	while(1) {
+		struct btrfs_key tmp;
+		src = path->nodes[0];
+		nritems = btrfs_header_nritems(src);
+		for (i = path->slots[0]; i < nritems; i++) {
+			btrfs_item_key_to_cpu(src, &min_key, i);
+
+			if (min_key.objectid != inode->i_ino ||
+			    min_key.type != key_type)
+				goto done;
+			ret = overwrite_item(trans, log, dst_path, src, i,
+					     &min_key);
+			BUG_ON(ret);
+		}
+		path->slots[0] = nritems;
+
+		/*
+		 * look ahead to the next leaf.  If it is not from this
+		 * transaction, log only its first key and end the range there
+		 */
+		ret = btrfs_next_leaf(root, path);
+		if (ret == 1) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+
+			BUG_ON(ret);
+			last_offset = tmp.offset;
+			goto done;
+		}
+	}
+done:
+	*last_offset_ret = last_offset;
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, dst_path);
+
+	/* insert the log range keys to indicate where the log is valid */
+	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+				 first_offset, last_offset);
+	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * logging directories is very similar to logging inodes.  We find all the
+ * items from the current transaction and write them to the log, first for
+ * BTRFS_DIR_ITEM_KEY and then again for BTRFS_DIR_INDEX_KEY.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than the
+ * smallest and one key larger than the largest key logged by this transaction.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path)
+{
+	u64 min_key;
+	u64 max_key;
+	int ret;
+	int key_type = BTRFS_DIR_ITEM_KEY;
+
+again:
+	min_key = 0;
+	max_key = 0;
+	while(1) {
+		ret = log_dir_items(trans, root, inode, path,
+				    dst_path, key_type, min_key,
+				    &max_key);
+		BUG_ON(ret);
+		if (max_key == (u64)-1)
+			break;
+		min_key = max_key + 1;	/* resume past the last logged key */
+	}
+
+	if (key_type == BTRFS_DIR_ITEM_KEY) {
+		key_type = BTRFS_DIR_INDEX_KEY;	/* now repeat for index keys */
+		goto again;
+	}
+	return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an inode.
+ * max_key_type indicates the highest item type to remove; items are deleted
+ * one per search, walking back from (objectid, max_key_type, (u64)-1).  This
+ * can't be run for file data extents: it doesn't free the extents they use.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  u64 objectid, int max_key_type)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	key.objectid = objectid;
+	key.type = max_key_type;
+	key.offset = (u64)-1;
+
+	while(1) {
+		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+
+		if (ret != 1)
+			break;	/* error or exact match: stop */
+
+		if (path->slots[0] == 0)
+			break;	/* no earlier slot in this leaf */
+
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+
+		if (found_key.objectid != objectid)
+			break;	/* previous item belongs to another object */
+
+		ret = btrfs_del_item(trans, log, path);
+		BUG_ON(ret);
+		btrfs_release_path(log, path);
+	}
+	btrfs_release_path(log, path);
+	return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree.  An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.  This handles both files and directories.
+ *
+ * Always returns 0; the errors it does check for are fatal (BUG_ON).
+ */
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, struct inode *inode,
+			     int inode_only)
+{
+	struct btrfs_path *path;
+	struct btrfs_path *dst_path;
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	unsigned long src_offset;
+	unsigned long dst_offset;
+	struct extent_buffer *src;
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_inode_item *inode_item;
+	u32 size;
+	int ret;
+
+	log = root->log_root;	/* (repeats the initializer above) */
+
+	path = btrfs_alloc_path();
+	dst_path = btrfs_alloc_path();
+	/* NOTE(review): neither path allocation is checked for NULL */
+	min_key.objectid = inode->i_ino;
+	min_key.type = BTRFS_INODE_ITEM_KEY;
+	min_key.offset = 0;
+
+	max_key.objectid = inode->i_ino;
+	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+		max_key.type = BTRFS_XATTR_ITEM_KEY;
+	else
+		max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+	/*
+	 * if this inode has already been logged and we're in inode_only
+	 * mode, we don't want to delete the things that have already
+	 * been written to the log.
+	 *
+	 * But, if the inode has been through an inode_only log,
+	 * the logged_trans field is not set.  This allows us to catch
+	 * any new names for this inode in the backrefs by logging it
+	 * again
+	 */
+	if (inode_only == LOG_INODE_EXISTS &&
+	    BTRFS_I(inode)->logged_trans == trans->transid) {
+		btrfs_free_path(path);
+		btrfs_free_path(dst_path);
+		goto out;
+	}
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+	/*
+	 * a brute force approach to making sure we get the most uptodate
+	 * copies of everything: empty this inode out of the log first.
+	 */
+	if (S_ISDIR(inode->i_mode)) {
+		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
+		ret = drop_objectid_items(trans, log, path,
+					  inode->i_ino, max_key_type);
+	} else {
+		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+	}
+	BUG_ON(ret);
+	path->keep_locks = 1;
+
+	while(1) {
+		ret = btrfs_search_forward(root, &min_key, &max_key,
+					   path, 0, trans->transid);
+		if (ret != 0)
+			break;
+
+		if (min_key.objectid != inode->i_ino)
+			break;
+		if (min_key.type > max_key.type)
+			break;
+
+		src = path->nodes[0];
+		size = btrfs_item_size_nr(src, path->slots[0]);
+		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
+					      size);
+		if (ret)
+			BUG();
+
+		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+						   dst_path->slots[0]);
+
+		src_offset = btrfs_item_ptr_offset(src, path->slots[0]);
+
+		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+				   src_offset, size);
+
+		if (inode_only == LOG_INODE_EXISTS &&
+		    min_key.type == BTRFS_INODE_ITEM_KEY) {
+			inode_item = btrfs_item_ptr(dst_path->nodes[0],
+						    dst_path->slots[0],
+						    struct btrfs_inode_item);
+			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+			/* set the generation to zero so the recovery code
+			 * can tell the difference between a logging
+			 * just to say 'this inode exists' and a logging
+			 * to say 'update this inode with these values'
+			 */
+			btrfs_set_inode_generation(dst_path->nodes[0],
+						   inode_item, 0);
+		}
+		/* take a reference on file data extents so that truncates
+		 * or deletes of this inode don't have to relog the inode
+		 * again
+		 */
+		if (btrfs_key_type(&min_key) == BTRFS_EXTENT_DATA_KEY) {
+			int found_type;
+			extent = btrfs_item_ptr(src, path->slots[0],
+						struct btrfs_file_extent_item);
+
+			found_type = btrfs_file_extent_type(src, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(src,
+								   extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(src,
+								      extent);
+				/* ds == 0 is a hole */
+				if (ds != 0) {
+					ret = btrfs_inc_extent_ref(trans, log,
+						   ds, dl,
+						   log->root_key.objectid,
+						   0,
+						   inode->i_ino,
+						   min_key.offset);
+					BUG_ON(ret);
+				}
+			}
+		}
+
+		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+		/* bump min_key to the next possible key and search again */
+		if (min_key.offset < (u64)-1)
+			min_key.offset++;
+		else if (min_key.type < (u8)-1)
+			min_key.type++;
+		else if (min_key.objectid < (u64)-1)
+			min_key.objectid++;
+		else
+			break;
+	}
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+		ret = log_directory_changes(trans, root, inode, path, dst_path);
+		BUG_ON(ret);
+	}
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	btrfs_free_path(path);
+	btrfs_free_path(dst_path);
+
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	ret = update_log_root(trans, log);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+out:
+	return 0;
+}
+
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    int inode_only)	/* see __btrfs_log_inode() */
+{
+	int ret;
+
+	start_log_trans(trans, root);	/* paired with end_log_trans() below */
+	ret = __btrfs_log_inode(trans, root, inode, inode_only);
+	end_log_trans(root);
+	return ret;
+}
+
+/*
+ * helper function around __btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log.  The dentry's inode is fully
+ * logged; then a minimal (LOG_INODE_EXISTS) logging is done of each parent
+ * directory that is newer than the last committed transaction
+ */
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct dentry *dentry)
+{
+	int inode_only = LOG_INODE_ALL;
+	struct super_block *sb;
+	int ret;
+
+	start_log_trans(trans, root);
+	sb = dentry->d_inode->i_sb;
+	while(1) {
+		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
+					inode_only);
+		BUG_ON(ret);
+		inode_only = LOG_INODE_EXISTS;	/* used for every parent */
+
+		dentry = dentry->d_parent;
+		if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+			break;
+
+		if (BTRFS_I(dentry->d_inode)->generation <=
+		    root->fs_info->last_trans_committed)
+			break;	/* parent predates the last commit: stop */
+	}
+	end_log_trans(root);
+	return 0;
+}
+
+/*
+ * it is not safe to log dentry if the chunk root has added new chunks, which
+ * is detected here as last_trans_new_blockgroup > last_trans_committed.
+ * This returns 0 if the dentry was logged, and 1 otherwise.  If this returns
+ * 1, you must commit the transaction to safely get your data on disk.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct dentry *dentry)
+{
+	u64 gen;
+	gen = root->fs_info->last_trans_new_blockgroup;
+	if (gen > root->fs_info->last_trans_committed)
+		return 1;
+	else
+		return btrfs_log_dentry(trans, root, dentry);
+}
+
+/*
+ * should be called during mount to replay any log trees found in
+ * log_root_tree; frees log_root_tree when done and returns 0
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key tmp_key;
+	struct btrfs_root *log;
+	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+	struct walk_control wc = {
+		.process_func = process_one_buffer,
+		.stage = 0,
+	};
+
+	fs_info->log_root_recovering = 1;
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+	wc.trans = trans;
+	wc.pin = 1;
+
+	walk_log_tree(trans, log_root_tree, &wc);
+
+again:
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.offset = (u64)-1;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	/* visit each log root item, highest key offset first */
+	while(1) {
+		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+		if (ret < 0)
+			break;	/* NOTE(review): search error is not reported */
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		btrfs_release_path(log_root_tree, path);
+		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			break;
+
+		log = btrfs_read_fs_root_no_radix(log_root_tree,
+						  &found_key);
+		BUG_ON(!log);
+
+		/* found_key.offset names the root this log replays into */
+		tmp_key.objectid = found_key.offset;
+		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+		tmp_key.offset = (u64)-1;
+
+		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+
+		BUG_ON(!wc.replay_dest);
+
+		btrfs_record_root_in_trans(wc.replay_dest);
+		ret = walk_log_tree(trans, log, &wc);
+		BUG_ON(ret);
+
+		if (wc.stage == LOG_WALK_REPLAY_ALL) {
+			ret = fixup_inode_link_counts(trans, wc.replay_dest,
+						      path);
+			BUG_ON(ret);
+		}
+
+		key.offset = found_key.offset - 1;
+		free_extent_buffer(log->node);
+		kfree(log);
+
+		if (found_key.offset == 0)
+			break;
+	}
+	btrfs_release_path(log_root_tree, path);
+
+	/* step one is to pin it all, step two is to replay just inodes */
+	if (wc.pin) {
+		wc.pin = 0;
+		wc.process_func = replay_one_buffer;
+		wc.stage = LOG_WALK_REPLAY_INODES;
+		goto again;
+	}
+	/* step three is to replay everything (stage LOG_WALK_REPLAY_ALL) */
+	if (wc.stage < LOG_WALK_REPLAY_ALL) {
+		wc.stage++;
+		goto again;
+	}
+
+	btrfs_free_path(path);
+
+	free_extent_buffer(log_root_tree->node);
+	log_root_tree->log_root = NULL;
+	fs_info->log_root_recovering = 0;
+
+	/* step 4: commit the transaction, which also unpins the blocks */
+	btrfs_commit_transaction(trans, fs_info->tree_root);
+
+	kfree(log_root_tree);
+	return 0;
+}
-- 
cgit v1.2.3-70-g09d2


From 4bef084857ab8fe71cf49eae349c25e440a49150 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Sep 2008 11:18:08 -0400
Subject: Btrfs: Tree logging fixes

* Pin down data blocks to prevent them from being reallocated like so:

trans 1: allocate file extent
trans 2: free file extent
trans 3: free file extent during old snapshot deletion
trans 3: allocate file extent to new file
trans 3: fsync new file

Before the tree logging code, this was legal because the fsync
would commit the transaction that did the final data extent free
and the transaction that allocated the extent to the new file
at the same time.

With the tree logging code, the tree log subtransaction can commit
before the transaction that freed the extent.  If we crash,
we're left with two different files using the extent.

* Don't wait in start_transaction if log replay is going on.  This
avoids deadlocks from iput while we're cleaning up link counts in the
replay code.

* Don't deadlock in replay_one_name by trying to read an inode off
the disk while holding paths for the directory

* Hold the buffer lock while we mark a buffer as written.  This
closes a race where someone is changing a buffer while we write it.
They are supposed to mark it dirty again after they change it, but
this violates the cow rules.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 33 ++++++++++++++++++++++++++++++---
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/extent-tree.c | 20 +++++++++++++++++---
 fs/btrfs/extent_io.c   | 47 ++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/extent_io.h   |  1 +
 fs/btrfs/transaction.c | 16 +++++++++++++---
 fs/btrfs/tree-log.c    | 13 +++++++++----
 7 files changed, 107 insertions(+), 24 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a4373db5967..42bf9916805 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -307,9 +307,7 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		goto err;
 	}
 	found_level = btrfs_header_level(eb);
-	spin_lock(&root->fs_info->hash_lock);
-	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-	spin_unlock(&root->fs_info->hash_lock);
+
 	csum_tree_block(root, eb, 0);
 err:
 	free_extent_buffer(eb);
@@ -1998,7 +1996,36 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return ret;
 }
 
+int btree_lock_page_hook(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_buffer *eb;
+	unsigned long len;
+	u64 bytenr = page_offset(page);
+	/* lock 'page', first flagging its extent buffer (if any) WRITTEN */
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;	/* no buffer lookup, just lock the page */
+
+	len = page->private >> 2;
+	eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+	if (!eb)
+		goto out;	/* no extent buffer found at this offset */
+	/* hold the buffer's tree lock while marking it written */
+	btrfs_tree_lock(eb);
+	spin_lock(&root->fs_info->hash_lock);
+	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+	spin_unlock(&root->fs_info->hash_lock);
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+out:
+	lock_page(page);
+	return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
+	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.writepage_io_hook = btree_writepage_io_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 6b6fdc697f3..f84f5058dbb 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -80,4 +80,5 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
+int btree_lock_page_hook(struct page *page);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 646b9148ca2..3181759da1c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1590,13 +1590,17 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 }
 
 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
-			  int pending)
+			  int is_data, int pending)
 {
 	int err = 0;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (!pending) {
 		struct extent_buffer *buf;
+
+		if (is_data)
+			goto pinit;
+
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
 			/* we can reuse a block if it hasn't been written
@@ -1624,6 +1628,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			}
 			free_extent_buffer(buf);
 		}
+pinit:
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
@@ -1744,7 +1749,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 #endif
 
 		if (pin) {
-			ret = pin_down_bytes(root, bytenr, num_bytes, 0);
+			ret = pin_down_bytes(root, bytenr, num_bytes,
+			     owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 0);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
@@ -1862,9 +1868,17 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		ref_generation = 0;
 
 	if (root == extent_root) {
-		pin_down_bytes(root, bytenr, num_bytes, 1);
+		pin_down_bytes(root, bytenr, num_bytes, 0, 1);
 		return 0;
 	}
+	/* if metadata always pin */
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		pin = 1;
+
+	/* if data pin when any transaction has committed this */
+	if (ref_generation != trans->transid)
+		pin = 1;
+
 	ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
 			    ref_generation, owner_objectid, owner_offset,
 			    pin, pin == 0);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 239e7c908ab..319a0c7a4a5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -29,7 +29,10 @@ static struct kmem_cache *extent_buffer_cache;
 
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
+
+#ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
+#endif
 
 #define BUFFER_LRU_MAX 64
 
@@ -106,7 +109,9 @@ EXPORT_SYMBOL(extent_io_tree_init);
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
+#ifdef LEAK_DEBUG
 	unsigned long flags;
+#endif
 
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state)
@@ -114,10 +119,11 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
+#ifdef LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&state->leak_list, &states);
 	spin_unlock_irqrestore(&leak_lock, flags);
-
+#endif
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
 	return state;
@@ -129,11 +135,15 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
+#ifdef LEAK_DEBUG
 		unsigned long flags;
+#endif
 		WARN_ON(state->tree);
+#ifdef LEAK_DEBUG
 		spin_lock_irqsave(&leak_lock, flags);
 		list_del(&state->leak_list);
 		spin_unlock_irqrestore(&leak_lock, flags);
+#endif
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -2070,13 +2080,13 @@ done:
 }
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-/* Taken directly from 2.6.23 for 2.6.18 back port */
+/* Taken directly from 2.6.23 with a mod for a lockpage hook */
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                 void *data);
+#endif
 
 /**
- * write_cache_pages - walk the list of dirty pages of the given address space
- * and write all of them.
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
  * @writepage: function called for each page
@@ -2090,9 +2100,10 @@ typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
  */
-static int write_cache_pages(struct address_space *mapping,
-		      struct writeback_control *wbc, writepage_t writepage,
-		      void *data)
+int extent_write_cache_pages(struct extent_io_tree *tree,
+			     struct address_space *mapping,
+			     struct writeback_control *wbc,
+			     writepage_t writepage, void *data)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
@@ -2138,7 +2149,10 @@ retry:
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
-			lock_page(page);
+			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
+				tree->ops->write_cache_pages_lock_hook(page);
+			else
+				lock_page(page);
 
 			if (unlikely(page->mapping != mapping)) {
 				unlock_page(page);
@@ -2187,9 +2201,12 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
+
+	if (wbc->range_cont)
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 }
-#endif
+EXPORT_SYMBOL(extent_write_cache_pages);
 
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
@@ -2214,7 +2231,8 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
 	ret = __extent_writepage(page, wbc, &epd);
 
-	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
+	extent_write_cache_pages(tree, mapping, &wbc_writepages,
+				 __extent_writepage, &epd);
 	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio, 0);
 	}
@@ -2235,7 +2253,8 @@ int extent_writepages(struct extent_io_tree *tree,
 		.get_extent = get_extent,
 	};
 
-	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+	ret = extent_write_cache_pages(tree, mapping, wbc,
+				       __extent_writepage, &epd);
 	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio, 0);
 	}
@@ -2567,15 +2586,19 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
+#ifdef LEAK_DEBUG
 	unsigned long flags;
+#endif
 
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	eb->start = start;
 	eb->len = len;
 	mutex_init(&eb->mutex);
+#ifdef LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
 	spin_unlock_irqrestore(&leak_lock, flags);
+#endif
 	atomic_set(&eb->refs, 1);
 
 	return eb;
@@ -2583,10 +2606,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
+#ifdef LEAK_DEBUG
 	unsigned long flags;
 	spin_lock_irqsave(&leak_lock, flags);
 	list_del(&eb->leak_list);
 	spin_unlock_irqrestore(&leak_lock, flags);
+#endif
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 315cfceae31..3cb411a5f4d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -50,6 +50,7 @@ struct extent_io_ops {
 			    unsigned long old, unsigned long bits);
 	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
+	int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
 struct extent_io_tree {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 49c4f5b40ed..61a377bcb2f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -161,7 +161,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	int ret;
 
 	mutex_lock(&root->fs_info->trans_mutex);
-	if ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)
+	if (!root->fs_info->log_root_recovering &&
+	    ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
 		wait_current_trans(root);
 	ret = join_transaction(root);
 	BUG_ON(ret);
@@ -328,9 +329,17 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 
 			index = start >> PAGE_CACHE_SHIFT;
 			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
-			page = find_lock_page(btree_inode->i_mapping, index);
+			page = find_get_page(btree_inode->i_mapping, index);
 			if (!page)
 				continue;
+
+			btree_lock_page_hook(page);
+			if (!page->mapping) {
+				unlock_page(page);
+				page_cache_release(page);
+				continue;
+			}
+
 			if (PageWriteback(page)) {
 				if (PageDirty(page))
 					wait_on_page_writeback(page);
@@ -360,7 +369,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 			if (!page)
 				continue;
 			if (PageDirty(page)) {
-				lock_page(page);
+				btree_lock_page_hook(page);
+				wait_on_page_writeback(page);
 				err = write_one_page(page, 0);
 				if (err)
 					werr = err;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d1ce8314b94..13d7ee8e0c5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1176,8 +1176,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	struct btrfs_key found_key;
 	struct btrfs_key log_key;
 	struct inode *dir;
-	struct inode *inode;
 	u8 log_type;
+	int exists;
 	int ret;
 
 	dir = read_one_inode(root, key->objectid);
@@ -1190,6 +1190,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 		   name_len);
 
 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+	if (exists == 0)
+		exists = 1;
+	else
+		exists = 0;
+	btrfs_release_path(root, path);
+
 	if (key->type == BTRFS_DIR_ITEM_KEY) {
 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
 				       name, name_len, 1);
@@ -1224,11 +1231,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	 * don't drop the conflicting directory entry if the inode
 	 * for the new entry doesn't exist
 	 */
-	inode = read_one_inode(root, log_key.objectid);
-	if (!inode)
+	if (!exists)
 		goto out;
 
-	iput(inode);
 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
 	BUG_ON(ret);
 
-- 
cgit v1.2.3-70-g09d2


From 8d5bf1cb35ea29795862ff0ea2f4c4d7e22727f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:51:21 -0400
Subject: Btrfs: Update the highest objectid in a root after log replay is done

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c    | 3 ++-
 fs/btrfs/tree-log.c | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b4dbb5c9764..073fdd77c78 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2239,8 +2239,9 @@ static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
 
 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
 		ret = btrfs_set_inode_index_count(dir);
-		if (ret)
+		if (ret) {
 			return ret;
+		}
 	}
 
 	*index = BTRFS_I(dir)->index_cnt;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 13d7ee8e0c5..3f4b139b27e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1027,6 +1027,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		inode->i_nlink = nlink;
 		btrfs_update_inode(trans, root, inode);
 	}
+	BTRFS_I(inode)->index_cnt = (u64)-1;
 
 	return 0;
 }
@@ -2714,6 +2715,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 	struct btrfs_key tmp_key;
 	struct btrfs_root *log;
 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+	u64 highest_inode;
 	struct walk_control wc = {
 		.process_func = process_one_buffer,
 		.stage = 0,
@@ -2772,6 +2774,11 @@ again:
 						      path);
 			BUG_ON(ret);
 		}
+		ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
+		if (ret == 0) {
+			wc.replay_dest->highest_inode = highest_inode;
+			wc.replay_dest->last_inode_alloc = highest_inode;
+		}
 
 		key.offset = found_key.offset - 1;
 		free_extent_buffer(log->node);
-- 
cgit v1.2.3-70-g09d2


From 49eb7e46d47ea72a9bd2a5f8cedb04f5159cc277 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:53:12 -0400
Subject: Btrfs: Dir fsync optimizations

Drop i_mutex during the commit

Don't bother doing the fsync at all unless the dir is marked as dirtied
and needing fsync in this transaction.  For directories, this means
that someone has unlinked a file from the dir without fsyncing the
file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  3 +++
 fs/btrfs/file.c        | 19 ++++++++++++++++++-
 fs/btrfs/inode.c       |  5 ++++-
 fs/btrfs/tree-log.c    |  8 ++++----
 4 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fcc8cf27e90..0577fda2168 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -56,6 +56,9 @@ struct btrfs_inode {
 	 * transid that last logged this inode
 	 */
 	u64 logged_trans;
+
+	/* trans that last made a change that should be fully fsync'd */
+	u64 log_dirty_trans;
 	u64 delalloc_bytes;
 	u64 disk_i_size;
 	u32 flags;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 84ecf3ab851..58b329ddb42 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1061,7 +1061,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	root->fs_info->tree_log_batch++;
 	filemap_fdatawait(inode->i_mapping);
+	root->fs_info->tree_log_batch++;
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
@@ -1076,14 +1078,29 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 
 	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-	if (ret < 0)
+	if (ret < 0) {
 		goto out;
+	}
+
+	/* we've logged all the items and now have a consistent
+	 * version of the file in the log.  It is possible that
+	 * someone will come in and modify the file, but that's
+	 * fine because the log is consistent on disk, and we
+	 * have references to all of the file's extents
+	 *
+	 * It is possible that someone will come in and log the
+	 * file again, but that will end up using the synchronization
+	 * inside btrfs_sync_log to keep things safe.
+	 */
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
 	} else {
 		btrfs_sync_log(trans, root);
 		ret = btrfs_end_transaction(trans, root);
 	}
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 24b7e97fccb..12c1c0530f3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1187,7 +1187,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 					 inode, dir->i_ino);
-	BUG_ON(ret);
+	BUG_ON(ret != 0 && ret != -ENOENT);
+	if (ret != -ENOENT)
+		BTRFS_I(dir)->log_dirty_trans = trans->transid;
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
@@ -1790,6 +1792,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
+	bi->log_dirty_trans = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3f4b139b27e..5d49a701bdc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1973,10 +1973,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	atomic_set(&log->fs_info->tree_log_commit, 1);
 
 	while(1) {
+		batch = log->fs_info->tree_log_batch;
 		mutex_unlock(&log->fs_info->tree_log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&log->fs_info->tree_log_mutex);
-		batch = log->fs_info->tree_log_batch;
 
 		while(atomic_read(&log->fs_info->tree_log_writers)) {
 			DEFINE_WAIT(wait);
@@ -2189,8 +2189,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 	end_log_trans(root);
 
-	if (ret == 0 || ret == -ENOENT)
-		return 0;
 	return ret;
 }
 
@@ -2620,9 +2618,11 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 		else
 			break;
 	}
-	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
+	    BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
+		BTRFS_I(inode)->log_dirty_trans = 0;
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		BUG_ON(ret);
 	}
-- 
cgit v1.2.3-70-g09d2


From 3a5f1d458ad1610a06e38f0be2fbc6ac215439c0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:53:37 -0400
Subject: Btrfs: Optimize btree walking while logging inodes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c  |  2 +-
 fs/btrfs/tree-log.c | 25 +++++++++++++++++++------
 2 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ecb74b72026..7c06eb4ecfd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1497,7 +1497,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * low idle thresh
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
-	fs_info->endio_write_workers.idle_thresh = 4;
+	fs_info->endio_write_workers.idle_thresh = 64;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5d49a701bdc..f43ee33ec2d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1982,7 +1982,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			DEFINE_WAIT(wait);
 			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
-			batch = log->fs_info->tree_log_batch;
 			mutex_unlock(&log->fs_info->tree_log_mutex);
 			if (atomic_read(&log->fs_info->tree_log_writers))
 				schedule();
@@ -2024,8 +2023,7 @@ out:
 
 }
 
-/*
- * free all the extents used by the tree log.  This should be called
+/* free all the extents used by the tree log.  This should be called
  * at commit time of the full transaction
  */
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2107,6 +2105,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 	int ret;
 	int bytes_del = 0;
 
+	if (BTRFS_I(dir)->logged_trans < trans->transid)
+		return 0;
+
 	ret = join_running_log_trans(root);
 	if (ret)
 		return 0;
@@ -2178,6 +2179,9 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	u64 index;
 	int ret;
 
+	if (BTRFS_I(inode)->logged_trans < trans->transid)
+		return 0;
+
 	ret = join_running_log_trans(root);
 	if (ret)
 		return 0;
@@ -2484,6 +2488,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_inode_item *inode_item;
 	u32 size;
 	int ret;
+	int nritems;
 
 	log = root->log_root;
 
@@ -2541,12 +2546,11 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 					   path, 0, trans->transid);
 		if (ret != 0)
 			break;
-
+again:
 		if (min_key.objectid != inode->i_ino)
 			break;
 		if (min_key.type > max_key.type)
 			break;
-
 		src = path->nodes[0];
 		size = btrfs_item_size_nr(src, path->slots[0]);
 		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
@@ -2606,9 +2610,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 		}
 
 		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
-		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
 
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		path->slots[0]++;
+		if (path->slots[0] < nritems) {
+			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
+					      path->slots[0]);
+			goto again;
+		}
+		btrfs_release_path(root, path);
+
 		if (min_key.offset < (u64)-1)
 			min_key.offset++;
 		else if (min_key.type < (u8)-1)
@@ -2626,6 +2638,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		BUG_ON(ret);
 	}
+	BTRFS_I(inode)->logged_trans = trans->transid;
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
-- 
cgit v1.2.3-70-g09d2


From d00aff00139b40f2e9c60299d76aac29d72e48ba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:54:42 -0400
Subject: Btrfs: Optimize tree log block allocations

Since tree log blocks get freed every transaction, they never really
need to be written to disk.  This skips the step where we update
metadata to record they were allocated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  5 ++---
 fs/btrfs/extent-tree.c | 19 ++++++++++++++-----
 fs/btrfs/tree-log.c    | 20 +++++++-------------
 3 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7c06eb4ecfd..5edb7f88579 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -829,9 +829,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(btrfs_header_level(eb) != 0);
 	WARN_ON(btrfs_header_nritems(eb) != 0);
 
-	ret = btrfs_free_extent(trans, fs_info->tree_root,
-				eb->start, eb->len,
-				BTRFS_TREE_LOG_OBJECTID, 0, 0, 0, 1);
+	ret = btrfs_free_reserved_extent(fs_info->tree_root,
+				eb->start, eb->len);
 	BUG_ON(ret);
 
 	free_extent_buffer(eb);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3181759da1c..c479d71e286 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1872,8 +1872,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 	/* if metadata always pin */
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+			/* btrfs_free_reserved_extent */
+			set_extent_dirty(&root->fs_info->free_space_cache,
+				 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+			return 0;
+		}
 		pin = 1;
+	}
 
 	/* if data pin when any transaction has committed this */
 	if (ref_generation != trans->transid)
@@ -2361,11 +2368,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
 	BUG_ON(ret);
-	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-					    ref_generation, owner,
-					    owner_offset, ins);
-	BUG_ON(ret);
+	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+		ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+						    ref_generation, owner,
+						    owner_offset, ins);
+		BUG_ON(ret);
 
+	}
 	maybe_unlock_mutex(root);
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f43ee33ec2d..5f77bee0f84 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1728,9 +1728,8 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 				WARN_ON(root_owner !=
 					BTRFS_TREE_LOG_OBJECTID);
-				ret = btrfs_free_extent(trans, root, bytenr,
-							blocksize, root_owner,
-							root_gen, 0, 0, 1);
+				ret = btrfs_free_reserved_extent(root,
+							 bytenr, blocksize);
 				BUG_ON(ret);
 			}
 			free_extent_buffer(next);
@@ -1775,8 +1774,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
-					  root_owner, root_gen, 0, 0, 1);
+		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
 		BUG_ON(ret);
 	}
 	free_extent_buffer(path->nodes[*level]);
@@ -1837,10 +1835,9 @@ static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
 				}
 
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-				ret = btrfs_free_extent(trans, root,
+				ret = btrfs_free_reserved_extent(root,
 						path->nodes[*level]->start,
-						path->nodes[*level]->len,
-						root_owner, root_gen, 0, 0, 1);
+						path->nodes[*level]->len);
 				BUG_ON(ret);
 			}
 			free_extent_buffer(path->nodes[*level]);
@@ -1910,11 +1907,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			}
 			WARN_ON(log->root_key.objectid !=
 				BTRFS_TREE_LOG_OBJECTID);
-			ret = btrfs_free_extent(trans, log,
-						next->start, next->len,
-						log->root_key.objectid,
-						btrfs_header_generation(next),
-						0, 0, 1);
+			ret = btrfs_free_reserved_extent(log, next->start,
+							 next->len);
 			BUG_ON(ret);
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From 31ff1cd25d376e8f499d450de177dffadc9e1c56 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 16:17:57 -0400
Subject: Btrfs: Copy into the log tree in big batches

This changes the log tree copy code to use btrfs_insert_items and
to work in larger batches where possible.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 183 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 122 insertions(+), 61 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5f77bee0f84..ae96451bc22 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2452,6 +2452,94 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/* copy nr items from src (starting at start_slot) into the log tree in one
+ * batch.  Returns 0, or -ENOMEM if the key/size array can't be allocated
+ */
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log,
+			       struct btrfs_path *dst_path,
+			       struct extent_buffer *src,
+			       int start_slot, int nr, int inode_only)
+{
+	unsigned long src_offset, dst_offset;
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_key *ins_keys;
+	u32 *ins_sizes;
+	char *ins_data;
+	int ret, i;
+
+	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+			   nr * sizeof(u32), GFP_NOFS);
+	if (!ins_data)
+		return -ENOMEM;
+	ins_sizes = (u32 *)ins_data;
+	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+
+	for (i = 0; i < nr; i++) {
+		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+	}
+	ret = btrfs_insert_empty_items(trans, log, dst_path,
+				       ins_keys, ins_sizes, nr);
+	BUG_ON(ret);
+
+	for (i = 0; i < nr; i++) {
+		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+						   dst_path->slots[0]);
+		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+				   src_offset, ins_sizes[i]);
+		if (inode_only == LOG_INODE_EXISTS &&
+		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+			inode_item = btrfs_item_ptr(dst_path->nodes[0],
+						    dst_path->slots[0],
+						    struct btrfs_inode_item);
+			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+			/* set the generation to zero so the recover code
+			 * can tell the difference between an logging
+			 * just to say 'this inode exists' and a logging
+			 * to say 'update this inode with these values'
+			 */
+			btrfs_set_inode_generation(dst_path->nodes[0],
+						   inode_item, 0);
+		}
+		/* take a reference on file data extents so that truncates
+		 * or deletes of this inode don't have to relog the inode
+		 * again
+		 */
+		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+			int found_type;
+			extent = btrfs_item_ptr(src, start_slot + i,
+						struct btrfs_file_extent_item);
+
+			found_type = btrfs_file_extent_type(src, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(src,
+								   extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(src,
+								      extent);
+				/* ds == 0 is a hole */
+				if (ds != 0) {
+					ret = btrfs_inc_extent_ref(trans, log,
+						   ds, dl,
+						   BTRFS_TREE_LOG_OBJECTID,
+						   0, ins_keys[i].objectid,
+						   ins_keys[i].offset);
+					BUG_ON(ret);
+				}
+			}
+		}
+		dst_path->slots[0]++;
+	}
+
+	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+	btrfs_release_path(log, dst_path);
+	kfree(ins_data);
+	return 0;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -2475,14 +2563,12 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key min_key;
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
-	unsigned long src_offset;
-	unsigned long dst_offset;
-	struct extent_buffer *src;
-	struct btrfs_file_extent_item *extent;
-	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *src = NULL;
 	u32 size;
 	int ret;
 	int nritems;
+	int ins_start_slot = 0;
+	int ins_nr;
 
 	log = root->log_root;
 
@@ -2536,75 +2622,35 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	path->keep_locks = 1;
 
 	while(1) {
+		ins_nr = 0;
 		ret = btrfs_search_forward(root, &min_key, &max_key,
 					   path, 0, trans->transid);
 		if (ret != 0)
 			break;
 again:
+		/* note, ins_nr might be > 0 here, cleanup outside the loop */
 		if (min_key.objectid != inode->i_ino)
 			break;
 		if (min_key.type > max_key.type)
 			break;
+
 		src = path->nodes[0];
 		size = btrfs_item_size_nr(src, path->slots[0]);
-		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
-					      size);
-		if (ret)
-			BUG();
-
-		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
-						   dst_path->slots[0]);
-
-		src_offset = btrfs_item_ptr_offset(src, path->slots[0]);
-
-		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
-				   src_offset, size);
-
-		if (inode_only == LOG_INODE_EXISTS &&
-		    min_key.type == BTRFS_INODE_ITEM_KEY) {
-			inode_item = btrfs_item_ptr(dst_path->nodes[0],
-						    dst_path->slots[0],
-						    struct btrfs_inode_item);
-			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
-
-			/* set the generation to zero so the recover code
-			 * can tell the difference between an logging
-			 * just to say 'this inode exists' and a logging
-			 * to say 'update this inode with these values'
-			 */
-			btrfs_set_inode_generation(dst_path->nodes[0],
-						   inode_item, 0);
-		}
-		/* take a reference on file data extents so that truncates
-		 * or deletes of this inode don't have to relog the inode
-		 * again
-		 */
-		if (btrfs_key_type(&min_key) == BTRFS_EXTENT_DATA_KEY) {
-			int found_type;
-			extent = btrfs_item_ptr(src, path->slots[0],
-						struct btrfs_file_extent_item);
-
-			found_type = btrfs_file_extent_type(src, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
-				u64 ds = btrfs_file_extent_disk_bytenr(src,
-								   extent);
-				u64 dl = btrfs_file_extent_disk_num_bytes(src,
-								      extent);
-				/* ds == 0 is a hole */
-				if (ds != 0) {
-					ret = btrfs_inc_extent_ref(trans, log,
-						   ds, dl,
-						   log->root_key.objectid,
-						   0,
-						   inode->i_ino,
-						   min_key.offset);
-					BUG_ON(ret);
-				}
-			}
+		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+			ins_nr++;
+			goto next_slot;
+		} else if (!ins_nr) {
+			ins_start_slot = path->slots[0];
+			ins_nr = 1;
+			goto next_slot;
 		}
 
-		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
-		btrfs_release_path(log, dst_path);
+		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+				 ins_nr, inode_only);
+		BUG_ON(ret);
+		ins_nr = 1;
+		ins_start_slot = path->slots[0];
+next_slot:
 
 		nritems = btrfs_header_nritems(path->nodes[0]);
 		path->slots[0]++;
@@ -2613,6 +2659,13 @@ again:
 					      path->slots[0]);
 			goto again;
 		}
+		if (ins_nr) {
+			ret = copy_items(trans, log, dst_path, src,
+					 ins_start_slot,
+					 ins_nr, inode_only);
+			BUG_ON(ret);
+			ins_nr = 0;
+		}
 		btrfs_release_path(root, path);
 
 		if (min_key.offset < (u64)-1)
@@ -2624,6 +2677,14 @@ again:
 		else
 			break;
 	}
+	if (ins_nr) {
+		ret = copy_items(trans, log, dst_path, src,
+				 ins_start_slot,
+				 ins_nr, inode_only);
+		BUG_ON(ret);
+		ins_nr = 0;
+	}
+	WARN_ON(ins_nr);
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
 	    BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
 		btrfs_release_path(root, path);
-- 
cgit v1.2.3-70-g09d2


From d0c803c4049c5ca322d4795d8b74f28768603e0e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 16:17:57 -0400
Subject: Btrfs: Record dirty pages tree-log pages in an extent_io tree

This is the same way the transaction code makes sure that all the
other tree blocks are safely on disk.  There's an extent_io tree
for each root, and any blocks allocated to the tree logs are
recorded in that tree.

At tree-log sync, the extent_io tree is walked to flush down the
dirty pages and wait for them.

The main benefit is less time spent walking the tree log and skipping
clean pages, and getting sequential IO down to the drive.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/disk-io.c     | 17 +++++++++++++++--
 fs/btrfs/extent-tree.c |  7 ++++++-
 fs/btrfs/transaction.c | 21 ++++++++++++++-------
 fs/btrfs/transaction.h |  2 ++
 fs/btrfs/tree-log.c    | 29 +++++++++++++++--------------
 6 files changed, 54 insertions(+), 24 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2ed6918f32e..eb65fd80888 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -668,6 +668,8 @@ struct btrfs_root {
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
 	struct inode *inode;
+	struct extent_io_tree dirty_log_pages;
+
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5edb7f88579..57fbf107e59 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -777,6 +777,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
+	extent_io_tree_init(&root->dirty_log_pages,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
 	root->ref_tree = &root->ref_tree_struct;
@@ -819,11 +821,23 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info)
 {
 	struct extent_buffer *eb;
+	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+	u64 start = 0;
+	u64 end = 0;
 	int ret;
 
-	if (!fs_info->log_root_tree)
+	if (!log_root_tree)
 		return 0;
 
+	while(1) {
+		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
+				    0, &start, &end, EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		clear_extent_dirty(&log_root_tree->dirty_log_pages,
+				   start, end, GFP_NOFS);
+	}
 	eb = fs_info->log_root_tree->node;
 
 	WARN_ON(btrfs_header_level(eb) != 0);
@@ -1412,7 +1426,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
 	insert_inode_hash(fs_info->btree_inode);
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c479d71e286..c0bb6b9ac4c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2392,8 +2392,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
 	btrfs_set_buffer_uptodate(buf);
-	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		set_extent_dirty(&root->dirty_log_pages, buf->start,
+			 buf->start + buf->len - 1, GFP_NOFS);
+	} else {
+		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
+	}
 	trans->blocks_used++;
 	return buf;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 61a377bcb2f..151b00d5259 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -302,23 +302,18 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 }
 
 
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root)
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages)
 {
 	int ret;
 	int err = 0;
 	int werr = 0;
-	struct extent_io_tree *dirty_pages;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	u64 start = 0;
 	u64 end;
 	unsigned long index;
 
-	if (!trans || !trans->transaction) {
-		return filemap_write_and_wait(btree_inode->i_mapping);
-	}
-	dirty_pages = &trans->transaction->dirty_pages;
 	while(1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
 					    EXTENT_DIRTY);
@@ -385,6 +380,18 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 	return werr;
 }
 
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root)
+{
+	if (!trans || !trans->transaction) {
+		struct inode *btree_inode;
+		btree_inode = root->fs_info->btree_inode;
+		return filemap_write_and_wait(btree_inode->i_mapping);
+	}
+	return btrfs_write_and_wait_marked_extents(root,
+					   &trans->transaction->dirty_pages);
+}
+
 static int update_cowonly_root(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index cc63650d60d..eef2cb7d7e7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -99,4 +99,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_root *root);
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ae96451bc22..bfa71080096 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1954,10 +1954,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	int ret;
 	unsigned long batch;
 	struct btrfs_root *log = root->log_root;
-	struct walk_control wc = {
-		.write = 1,
-		.process_func = process_one_buffer
-	};
 
 	mutex_lock(&log->fs_info->tree_log_mutex);
 	if (atomic_read(&log->fs_info->tree_log_commit)) {
@@ -1985,18 +1981,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		if (batch == log->fs_info->tree_log_batch)
 			break;
 	}
-	ret = walk_log_tree(trans, log, &wc);
-	BUG_ON(ret);
-
-	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
-	BUG_ON(ret);
-
-	wc.wait = 1;
 
-	ret = walk_log_tree(trans, log, &wc);
+	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
-
-	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
+			       &root->fs_info->log_root_tree->dirty_log_pages);
 	BUG_ON(ret);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
@@ -2025,6 +2014,8 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	int ret;
 	struct btrfs_root *log;
 	struct key;
+	u64 start;
+	u64 end;
 	struct walk_control wc = {
 		.free = 1,
 		.process_func = process_one_buffer
@@ -2037,6 +2028,16 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	ret = walk_log_tree(trans, log, &wc);
 	BUG_ON(ret);
 
+	while(1) {
+		ret = find_first_extent_bit(&log->dirty_log_pages,
+				    0, &start, &end, EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		clear_extent_dirty(&log->dirty_log_pages,
+				   start, end, GFP_NOFS);
+	}
+
 	log = root->log_root;
 	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
 			     &log->root_key);
-- 
cgit v1.2.3-70-g09d2


From 9623f9a3894ec95fe140ee5460bb840ac53bc6a0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 17:42:42 -0400
Subject: Btrfs: Disable the dir fsync optimization to skip logging the dir
 sometimes

More testing has turned up a bug; disable this for now.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index bfa71080096..56de3fb2d8d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2686,8 +2686,7 @@ next_slot:
 		ins_nr = 0;
 	}
 	WARN_ON(ins_nr);
-	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
-	    BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
 		BTRFS_I(inode)->log_dirty_trans = 0;
-- 
cgit v1.2.3-70-g09d2


From 31840ae1a6b433ca0e6a8d341756ff478bbf959e Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Tue, 23 Sep 2008 13:14:14 -0400
Subject: Btrfs: Full back reference support

This patch makes the back reference system explicitly record the
location of the parent node for all types of extents. The location of
the parent node is placed into the offset field of the backref key.
Every time a tree block is balanced, the back references for the
affected lower level extents are updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  228 ++++++----
 fs/btrfs/ctree.h       |   62 ++-
 fs/btrfs/disk-io.c     |    4 +-
 fs/btrfs/extent-tree.c | 1143 +++++++++++++++++++++++++++++-------------------
 fs/btrfs/extent_io.c   |    3 +-
 fs/btrfs/file.c        |  121 +++--
 fs/btrfs/inode.c       |   57 ++-
 fs/btrfs/ioctl.c       |   57 +--
 fs/btrfs/print-tree.c  |    5 +-
 fs/btrfs/tree-log.c    |  108 ++---
 10 files changed, 1066 insertions(+), 722 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6f467901246..50aea8cb653 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -125,7 +125,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	u32 nritems;
 	int ret = 0;
 	int level;
-	struct btrfs_key first_key;
 	struct btrfs_root *new_root;
 
 	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
@@ -141,18 +140,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
-	if (nritems) {
-		if (level == 0)
-			btrfs_item_key_to_cpu(buf, &first_key, 0);
-		else
-			btrfs_node_key_to_cpu(buf, &first_key, 0);
-	} else {
-		first_key.objectid = 0;
-	}
-	cow = btrfs_alloc_free_block(trans, new_root, buf->len,
-				       new_root_objectid,
-				       trans->transid, first_key.objectid,
-				       level, buf->start, 0);
+
+	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
+				     new_root_objectid, trans->transid,
+				     level, buf->start, 0);
 	if (IS_ERR(cow)) {
 		kfree(new_root);
 		return PTR_ERR(cow);
@@ -165,7 +156,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf, 0);
+	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
 	kfree(new_root);
 
 	if (ret)
@@ -184,39 +175,31 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     u64 search_start, u64 empty_size,
 			     u64 prealloc_dest)
 {
-	u64 root_gen;
+	u64 parent_start;
 	struct extent_buffer *cow;
 	u32 nritems;
 	int ret = 0;
 	int different_trans = 0;
 	int level;
 	int unlock_orig = 0;
-	struct btrfs_key first_key;
 
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
 	WARN_ON(!btrfs_tree_locked(buf));
 
-	if (root->ref_cows) {
-		root_gen = trans->transid;
-	} else {
-		root_gen = 0;
-	}
+	if (parent)
+		parent_start = parent->start;
+	else
+		parent_start = 0;
+
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
-	if (nritems) {
-		if (level == 0)
-			btrfs_item_key_to_cpu(buf, &first_key, 0);
-		else
-			btrfs_node_key_to_cpu(buf, &first_key, 0);
-	} else {
-		first_key.objectid = 0;
-	}
+
 	if (prealloc_dest) {
 		struct btrfs_key ins;
 
@@ -224,19 +207,19 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		ins.offset = buf->len;
 		ins.type = BTRFS_EXTENT_ITEM_KEY;
 
-		ret = btrfs_alloc_reserved_extent(trans, root,
+		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
 						  root->root_key.objectid,
-						  root_gen, level,
-						  first_key.objectid,
+						  trans->transid, level, 0,
 						  &ins);
 		BUG_ON(ret);
 		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
 					    buf->len);
 	} else {
 		cow = btrfs_alloc_free_block(trans, root, buf->len,
+					     parent_start,
 					     root->root_key.objectid,
-					     root_gen, first_key.objectid,
-					     level, search_start, empty_size);
+					     trans->transid, level,
+					     search_start, empty_size);
 	}
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
@@ -249,17 +232,23 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
+		u32 nr_extents;
 		different_trans = 1;
-		ret = btrfs_inc_ref(trans, root, buf, 1);
+		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
 		if (ret)
 			return ret;
+
+		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
+		WARN_ON(ret);
 	} else {
+		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
+		if (ret)
+			return ret;
 		clean_tree_block(trans, root, buf);
 	}
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
-		root_gen = btrfs_header_generation(buf);
 
 		spin_lock(&root->node_lock);
 		root->node = cow;
@@ -268,13 +257,14 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 		if (buf != root->commit_root) {
 			btrfs_free_extent(trans, root, buf->start,
-					  buf->len, root->root_key.objectid,
-					  root_gen, 0, 0, 1);
+					  buf->len, buf->start,
+					  root->root_key.objectid,
+					  btrfs_header_generation(buf),
+					  0, 0, 1);
 		}
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
-		root_gen = btrfs_header_generation(parent);
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
 		WARN_ON(trans->transid == 0);
@@ -283,8 +273,8 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_mark_buffer_dirty(parent);
 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
 		btrfs_free_extent(trans, root, buf->start, buf->len,
-				  btrfs_header_owner(parent), root_gen,
-				  0, 0, 1);
+				  parent_start, btrfs_header_owner(parent),
+				  btrfs_header_generation(parent), 0, 0, 1);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -831,6 +821,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		root->node = child;
 		spin_unlock(&root->node_lock);
 
+		ret = btrfs_update_extent_ref(trans, root, child->start,
+					      mid->start, child->start,
+					      root->root_key.objectid,
+					      trans->transid, level - 1, 0);
+		BUG_ON(ret);
+
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
 		path->locks[level] = 0;
@@ -840,7 +836,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* once for the path */
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
-					root->root_key.objectid,
+					mid->start, root->root_key.objectid,
 					btrfs_header_generation(mid), 0, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
@@ -905,7 +901,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root, bytenr,
-						 blocksize,
+						 blocksize, parent->start,
 						 btrfs_header_owner(parent),
 						 generation, 0, 0, 1);
 			if (wret)
@@ -954,6 +950,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret)
 			ret = wret;
 		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
+					 parent->start,
 					 btrfs_header_owner(parent),
 					 root_gen, 0, 0, 1);
 		if (wret)
@@ -1499,6 +1496,41 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * update item key.
+ *
+ * This function isn't completely safe. It's the caller's responsibility
+ * to ensure that the new key won't break the key order.
+ */
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *new_key)
+{
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *eb;
+	int slot;
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	if (slot > 0) {
+		btrfs_item_key(eb, &disk_key, slot - 1);
+		if (comp_keys(&disk_key, new_key) >= 0)
+			return -1;
+	}
+	if (slot < btrfs_header_nritems(eb) - 1) {
+		btrfs_item_key(eb, &disk_key, slot + 1);
+		if (comp_keys(&disk_key, new_key) <= 0)
+			return -1;
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, new_key);
+	btrfs_set_item_key(eb, &disk_key, slot);
+	btrfs_mark_buffer_dirty(eb);
+	if (slot == 0)
+		fixup_low_keys(trans, root, path, &disk_key, 1);
+	return 0;
+}
+
 /*
  * try to push data from one node into the next node left in the
  * tree.
@@ -1558,6 +1590,10 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
+
+	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
+	BUG_ON(ret);
+
 	return ret;
 }
 
@@ -1619,6 +1655,10 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
+
+	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
+	BUG_ON(ret);
+
 	return ret;
 }
 
@@ -1633,30 +1673,24 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
-	u64 root_gen;
 	u64 lower_gen;
 	struct extent_buffer *lower;
 	struct extent_buffer *c;
 	struct extent_buffer *old;
 	struct btrfs_disk_key lower_key;
+	int ret;
 
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	if (root->ref_cows)
-		root_gen = trans->transid;
-	else
-		root_gen = 0;
-
 	lower = path->nodes[level-1];
 	if (level == 1)
 		btrfs_item_key(lower, &lower_key, 0);
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = btrfs_alloc_free_block(trans, root, root->nodesize,
-				   root->root_key.objectid,
-				   root_gen, le64_to_cpu(lower_key.objectid),
+	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
+				   root->root_key.objectid, trans->transid,
 				   level, root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
@@ -1679,7 +1713,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
 	lower_gen = btrfs_header_generation(lower);
-	WARN_ON(lower_gen == 0);
+	WARN_ON(lower_gen != trans->transid);
 
 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
 
@@ -1690,6 +1724,12 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	root->node = c;
 	spin_unlock(&root->node_lock);
 
+	ret = btrfs_update_extent_ref(trans, root, lower->start,
+				      lower->start, c->start,
+				      root->root_key.objectid,
+				      trans->transid, level - 1, 0);
+	BUG_ON(ret);
+
 	/* the super has an extra ref to root->node */
 	free_extent_buffer(old);
 
@@ -1698,20 +1738,6 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	path->nodes[level] = c;
 	path->locks[level] = 1;
 	path->slots[level] = 0;
-
-	if (root->ref_cows && lower_gen != trans->transid) {
-		struct btrfs_path *back_path = btrfs_alloc_path();
-		int ret;
-		mutex_lock(&root->fs_info->alloc_mutex);
-		ret = btrfs_insert_extent_backref(trans,
-						  root->fs_info->extent_root,
-						  path, lower->start,
-						  root->root_key.objectid,
-						  trans->transid, 0, 0);
-		BUG_ON(ret);
-		mutex_unlock(&root->fs_info->alloc_mutex);
-		btrfs_free_path(back_path);
-	}
 	return 0;
 }
 
@@ -1766,7 +1792,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct btrfs_path *path, int level)
 {
-	u64 root_gen;
 	struct extent_buffer *c;
 	struct extent_buffer *split;
 	struct btrfs_disk_key disk_key;
@@ -1793,17 +1818,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	}
 
 	c_nritems = btrfs_header_nritems(c);
-	if (root->ref_cows)
-		root_gen = trans->transid;
-	else
-		root_gen = 0;
 
-	btrfs_node_key(c, &disk_key, 0);
 	split = btrfs_alloc_free_block(trans, root, root->nodesize,
-					 root->root_key.objectid,
-					 root_gen,
-					 btrfs_disk_key_objectid(&disk_key),
-					 level, c->start, 0);
+					path->nodes[level + 1]->start,
+					root->root_key.objectid,
+					trans->transid, level, c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -1840,6 +1859,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	if (wret)
 		ret = wret;
 
+	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
+	BUG_ON(ret);
+
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
 		btrfs_tree_unlock(c);
@@ -1955,10 +1977,23 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	else
 		nr = 1;
 
+	if (path->slots[0] >= left_nritems)
+		push_space += data_size + sizeof(*item);
+
 	i = left_nritems - 1;
 	while (i >= nr) {
 		item = btrfs_item_nr(left, i);
 
+		if (!empty && push_items > 0) {
+			if (path->slots[0] > i)
+				break;
+			if (path->slots[0] == i) {
+				int space = btrfs_leaf_free_space(root, left);
+				if (space + push_space * 2 > free_space)
+					break;
+			}
+		}
+
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
 
@@ -1973,6 +2008,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		this_item_size = btrfs_item_size(left, item);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
+
 		push_items++;
 		push_space += this_item_size + sizeof(*item);
 		if (i == 0)
@@ -2046,6 +2082,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_mark_buffer_dirty(left);
 	btrfs_mark_buffer_dirty(right);
 
+	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
+	BUG_ON(ret);
+
 	btrfs_item_key(right, &disk_key, 0);
 	btrfs_set_node_key(upper, &disk_key, slot + 1);
 	btrfs_mark_buffer_dirty(upper);
@@ -2147,6 +2186,16 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 					KM_USER1);
 		}
 
+		if (!empty && push_items > 0) {
+			if (path->slots[0] < i)
+				break;
+			if (path->slots[0] == i) {
+				int space = btrfs_leaf_free_space(root, right);
+				if (space + push_space * 2 > free_space)
+					break;
+			}
+		}
+
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
 
@@ -2255,6 +2304,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 
+	ret = btrfs_update_ref(trans, root, right, left,
+			       old_left_nritems, push_items);
+	BUG_ON(ret);
+
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	if (wret)
@@ -2294,7 +2347,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			       struct btrfs_path *path, int data_size,
 			       int extend)
 {
-	u64 root_gen;
 	struct extent_buffer *l;
 	u32 nritems;
 	int mid;
@@ -2313,11 +2365,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	if (extend)
 		space_needed = data_size;
 
-	if (root->ref_cows)
-		root_gen = trans->transid;
-	else
-		root_gen = 0;
-
 	/* first try to make some room by pushing left and right */
 	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
@@ -2348,13 +2395,10 @@ again:
 	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1)/ 2;
 
-	btrfs_item_key(l, &disk_key, 0);
-
 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
-					 root->root_key.objectid,
-					 root_gen,
-					 le64_to_cpu(disk_key.objectid),
-					 0, l->start, 0);
+					path->nodes[1]->start,
+					root->root_key.objectid,
+					trans->transid, 0, l->start, 0);
 	if (IS_ERR(right)) {
 		BUG_ON(1);
 		return PTR_ERR(right);
@@ -2485,6 +2529,9 @@ again:
 	btrfs_mark_buffer_dirty(l);
 	BUG_ON(path->slots[0] != slot);
 
+	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+	BUG_ON(ret);
+
 	if (mid <= slot) {
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
@@ -2956,6 +3003,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				ret = wret;
 			wret = btrfs_free_extent(trans, root,
 					 leaf->start, leaf->len,
+					 path->nodes[1]->start,
 					 btrfs_header_owner(path->nodes[1]),
 					 root_gen, 0, 0, 1);
 			if (wret)
@@ -3007,7 +3055,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 				free_extent_buffer(leaf);
 				wret = btrfs_free_extent(trans, root, bytenr,
-					     blocksize,
+					     blocksize, path->nodes[1]->start,
 					     btrfs_header_owner(path->nodes[1]),
 					     root_gen, 0, 0, 1);
 				if (wret)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 730aae3bc18..138c157bbc4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,7 +40,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_B8RfS_M"
+#define BTRFS_MAGIC "_B9RfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
@@ -81,6 +81,9 @@ struct btrfs_ordered_sum;
 #define BTRFS_TREE_LOG_OBJECTID -6ULL
 #define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
 
+/* dummy objectid that represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+
 /*
  * All files have objectids in this range.
  */
@@ -369,6 +372,7 @@ struct btrfs_extent_ref {
 	__le64 generation;
 	__le64 objectid;
 	__le64 offset;
+	__le32 num_refs;
 } __attribute__ ((__packed__));
 
 /* dev extents record free space on individual devices.  The owner
@@ -1047,9 +1051,6 @@ btrfs_inode_otime(struct btrfs_inode_item *inode_item)
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 
-/* struct btrfs_extent_item */
-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
-
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
 		   chunk_tree, 64);
@@ -1070,14 +1071,20 @@ BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
 BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
 BTRFS_SETGET_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
+BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
 
 BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
 			 objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref,
+			 offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
+			 num_refs, 32);
 
+/* struct btrfs_extent_item */
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
 			 refs, 32);
 
@@ -1474,8 +1481,7 @@ static inline struct dentry *fdentry(struct file *file) {
 }
 
 /* extent-tree.c */
-int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
-			u64 start, u64 len);
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1495,10 +1501,9 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 int data, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
-					     u32 blocksize,
+					     u32 blocksize, u64 parent,
 					     u64 root_objectid,
 					     u64 ref_generation,
-					     u64 first_objectid,
 					     int level,
 					     u64 hint,
 					     u64 empty_size);
@@ -1508,23 +1513,24 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 struct btrfs_path *path, u64 bytenr,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 parent,
 				 u64 root_objectid, u64 ref_generation,
 				 u64 owner, u64 owner_offset);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
-		       u64 num_bytes, u64 min_bytes,
+		       u64 num_bytes, u64 parent, u64 min_bytes,
 		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data);
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins);
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins);
@@ -1535,9 +1541,16 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int cache_ref);
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 bytenr, u64 num_bytes,
+		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+		  u32 *nr_extents);
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct extent_buffer *buf, u32 nr_extents);
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root, struct extent_buffer *orig_buf,
+		     struct extent_buffer *buf, int start_slot, int nr);
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent,
 		      u64 root_objectid, u64 ref_generation,
 		      u64 owner_objectid, u64 owner_offset, int pin);
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -1545,10 +1558,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct extent_io_tree *unpin);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset);
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 root_objectid, u64 ref_generation,
+			 u64 owner, u64 owner_offset);
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 orig_parent, u64 parent,
+			    u64 root_objectid, u64 ref_generation,
+			    u64 owner, u64 owner_offset);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
@@ -1561,7 +1579,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
-
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 25be96946a2..d35ca6a3f51 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -882,8 +882,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 	root->ref_cows = 0;
 
 	root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
-					    BTRFS_TREE_LOG_OBJECTID,
-					    0, 0, 0, 0, 0);
+					    0, BTRFS_TREE_LOG_OBJECTID,
+					    trans->transid, 0, 0, 0);
 
 	btrfs_set_header_nritems(root->node, 0);
 	btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 558fbe40736..5258923d621 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -29,6 +29,21 @@
 #include "locking.h"
 #include "ref-cache.h"
 
+#define PENDING_EXTENT_INSERT 0
+#define PENDING_EXTENT_DELETE 1
+#define PENDING_BACKREF_UPDATE 2
+
+struct pending_extent_op {
+	int type;
+	u64 bytenr;
+	u64 num_bytes;
+	u64 parent;
+	u64 orig_parent;
+	u64 generation;
+	u64 orig_generation;
+	int level;
+};
+
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -487,48 +502,15 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	return ret;
 }
 
-static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
-			   u64 owner, u64 owner_offset)
-{
-	u32 high_crc = ~(u32)0;
-	u32 low_crc = ~(u32)0;
-	__le64 lenum;
-	lenum = cpu_to_le64(root_objectid);
-	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
-	lenum = cpu_to_le64(ref_generation);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
-	if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
-		lenum = cpu_to_le64(owner);
-		low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
-		lenum = cpu_to_le64(owner_offset);
-		low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
-	}
-	return ((u64)high_crc << 32) | (u64)low_crc;
-}
-
-static int match_extent_ref(struct extent_buffer *leaf,
-			    struct btrfs_extent_ref *disk_ref,
-			    struct btrfs_extent_ref *cpu_ref)
-{
-	int ret;
-	int len;
-
-	if (cpu_ref->objectid)
-		len = sizeof(*cpu_ref);
-	else
-		len = 2 * sizeof(u64);
-	ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
-				   len);
-	return ret == 0;
-}
-
 /* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
-			u64 start, u64 len)
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	int ret;
 	struct btrfs_key key;
+	struct btrfs_path *path;
 
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 	maybe_lock_mutex(root);
 	key.objectid = start;
 	key.offset = len;
@@ -536,72 +518,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
 	maybe_unlock_mutex(root);
-	return ret;
-}
-
-static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path, u64 bytenr,
-					  u64 root_objectid,
-					  u64 ref_generation, u64 owner,
-					  u64 owner_offset, int del)
-{
-	u64 hash;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct btrfs_extent_ref ref;
-	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *disk_ref;
-	int ret;
-	int ret2;
-
-	btrfs_set_stack_ref_root(&ref, root_objectid);
-	btrfs_set_stack_ref_generation(&ref, ref_generation);
-	btrfs_set_stack_ref_objectid(&ref, owner);
-	btrfs_set_stack_ref_offset(&ref, owner_offset);
-
-	hash = hash_extent_ref(root_objectid, ref_generation, owner,
-			       owner_offset);
-	key.offset = hash;
-	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-
-	while (1) {
-		ret = btrfs_search_slot(trans, root, &key, path,
-					del ? -1 : 0, del);
-		if (ret < 0)
-			goto out;
-		leaf = path->nodes[0];
-		if (ret != 0) {
-			u32 nritems = btrfs_header_nritems(leaf);
-			if (path->slots[0] >= nritems) {
-				ret2 = btrfs_next_leaf(root, path);
-				if (ret2)
-					goto out;
-				leaf = path->nodes[0];
-			}
-			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-			if (found_key.objectid != bytenr ||
-			    found_key.type != BTRFS_EXTENT_REF_KEY)
-				goto out;
-			key.offset = found_key.offset;
-			if (del) {
-				btrfs_release_path(root, path);
-				continue;
-			}
-		}
-		disk_ref = btrfs_item_ptr(path->nodes[0],
-					  path->slots[0],
-					  struct btrfs_extent_ref);
-		if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
-			ret = 0;
-			goto out;
-		}
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		key.offset = found_key.offset + 1;
-		btrfs_release_path(root, path);
-	}
-out:
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -622,7 +539,7 @@ out:
  * File extents can be referenced by:
  *
  * - multiple snapshots, subvolumes, or different generations in one subvol
- * - different files inside a single subvolume (in theory, not implemented yet)
+ * - different files inside a single subvolume
  * - different offsets inside a file (bookend extents in file.c)
  *
  * The extent ref structure has fields for:
@@ -631,119 +548,284 @@ out:
  * - Generation number of the tree holding the reference
  * - objectid of the file holding the reference
  * - offset in the file corresponding to the key holding the reference
+ * - number of references held by the parent node (always 1 for tree blocks)
+ *
+ * A btree leaf may hold multiple references to a file extent. In most cases,
+ * these references are from the same file and the corresponding offsets
+ * inside the file are close together. So the inode objectid and offset in
+ * file are just hints: they indicate where in the btree the references
+ * can be found and when we can stop searching.
  *
  * When a file extent is allocated the fields are filled in:
- *     (root_key.objectid, trans->transid, inode objectid, offset in file)
+ *     (root_key.objectid, trans->transid, inode objectid, offset in file, 1)
  *
  * When a leaf is cow'd new references are added for every file extent found
- * in the leaf.  It looks the same as the create case, but trans->transid
- * will be different when the block is cow'd.
+ * in the leaf.  It looks similar to the create case, but trans->transid will
+ * be different when the block is cow'd.
  *
- *     (root_key.objectid, trans->transid, inode objectid, offset in file)
+ *     (root_key.objectid, trans->transid, inode objectid, offset in file,
+ *      number of references in the leaf)
  *
- * When a file extent is removed either during snapshot deletion or file
- * truncation, the corresponding back reference is found
- * by searching for:
+ * Because inode objectid and offset in file are just hints, they are not
+ * used when backrefs are deleted. When a file extent is removed either
+ * during snapshot deletion or file truncation, we find the corresponding
+ * back reference and check the following fields:
  *
- *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
- *      inode objectid, offset in file)
+ *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf))
  *
  * Btree extents can be referenced by:
  *
  * - Different subvolumes
  * - Different generations of the same subvolume
  *
- * Storing sufficient information for a full reverse mapping of a btree
- * block would require storing the lowest key of the block in the backref,
- * and it would require updating that lowest key either before write out or
- * every time it changed.  Instead, the objectid of the lowest key is stored
- * along with the level of the tree block.  This provides a hint
- * about where in the btree the block can be found.  Searches through the
- * btree only need to look for a pointer to that block, so they stop one
- * level higher than the level recorded in the backref.
- *
- * Some btrees do not do reference counting on their extents.  These
- * include the extent tree and the tree of tree roots.  Backrefs for these
- * trees always have a generation of zero.
- *
  * When a tree block is created, back references are inserted:
  *
- * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
+ * (root->root_key.objectid, trans->transid, level, 0, 1)
  *
- * When a tree block is cow'd in a reference counted root,
- * new back references are added for all the blocks it points to.
- * These are of the form (trans->transid will have increased since creation):
+ * When a tree block is cow'd, new back references are added for all the
+ * blocks it points to. If the tree block isn't in a reference counted root,
+ * the old back references are removed. These new back references are of
+ * the form (trans->transid will have increased since creation):
  *
- * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
+ * (root->root_key.objectid, trans->transid, level, 0, 1)
  *
- * Because the lowest_key_objectid and the level are just hints
- * they are not used when backrefs are deleted.  When a backref is deleted:
+ * When a backref is being deleted, the following fields are checked:
  *
  * if backref was for a tree root:
- *     root_objectid = root->root_key.objectid
+ *     (btrfs_header_owner(itself), btrfs_header_generation(itself))
  * else
- *     root_objectid = btrfs_header_owner(parent)
+ *     (btrfs_header_owner(parent), btrfs_header_generation(parent))
  *
- * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
+ * Back Reference Key composition:
  *
- * Back Reference Key hashing:
- *
- * Back references have four fields, each 64 bits long.  Unfortunately,
- * This is hashed into a single 64 bit number and placed into the key offset.
- * The key objectid corresponds to the first byte in the extent, and the
- * key type is set to BTRFS_EXTENT_REF_KEY
+ * The key objectid corresponds to the first byte in the extent, the key
+ * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
+ * byte of the parent extent. If an extent is a tree root, the key offset is
+ * set to the key objectid.
  */
-int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct btrfs_path *path, u64 bytenr,
-				 u64 root_objectid, u64 ref_generation,
-				 u64 owner, u64 owner_offset)
+
+static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, u64 bytenr,
+					  u64 parent, u64 ref_root,
+					  u64 ref_generation, int del)	/* returns 0 if found and matching, -ENOENT if no such key, -EIO on root/generation mismatch, or a negative search error */
 {
-	u64 hash;
 	struct btrfs_key key;
-	struct btrfs_extent_ref ref;
-	struct btrfs_extent_ref *disk_ref;
+	struct btrfs_extent_ref *ref;
+	struct extent_buffer *leaf;
 	int ret;
 
-	btrfs_set_stack_ref_root(&ref, root_objectid);
-	btrfs_set_stack_ref_generation(&ref, ref_generation);
-	btrfs_set_stack_ref_objectid(&ref, owner);
-	btrfs_set_stack_ref_offset(&ref, owner_offset);
+	key.objectid = bytenr;	/* backref key is (extent start, EXTENT_REF_KEY, parent) */
+	key.type = BTRFS_EXTENT_REF_KEY;
+	key.offset = parent;
+
+	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);	/* del selects -1 vs 0 for the fifth argument; presumably ins_len - TODO confirm */
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {	/* positive result is reported to the caller as -ENOENT */
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+	if (btrfs_ref_root(leaf, ref) != ref_root ||	/* the stored root and generation must match what the caller expects */
+	    btrfs_ref_generation(leaf, ref) != ref_generation) {
+		ret = -EIO;	/* key exists but its contents disagree with the caller */
+		WARN_ON(1);
+		goto out;
+	}
+	ret = 0;
+out:	/* the path is not released here; on success it still points at the backref item */
+	return ret;
+}
+
+static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 ref_root, u64 ref_generation,
+					  u64 owner_objectid, u64 owner_offset)	/* returns 0, -EIO on root/generation mismatch, or the error from btrfs_insert_empty_item() */
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref *ref;
+	u32 num_refs;
+	int ret;
 
-	hash = hash_extent_ref(root_objectid, ref_generation, owner,
-			       owner_offset);
-	key.offset = hash;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_REF_KEY;
+	key.offset = parent;	/* key offset is the parent block, not a hash of the ref fields */
 
-	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
-	while (ret == -EEXIST) {
-		disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-					  struct btrfs_extent_ref);
-		if (match_extent_ref(path->nodes[0], disk_ref, &ref))
+	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
+	if (ret == 0) {	/* new item: fill in every field with a reference count of 1 */
+		leaf = path->nodes[0];
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_ref);
+		btrfs_set_ref_root(leaf, ref, ref_root);
+		btrfs_set_ref_generation(leaf, ref, ref_generation);
+		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
+		btrfs_set_ref_offset(leaf, ref, owner_offset);
+		btrfs_set_ref_num_refs(leaf, ref, 1);
+	} else if (ret == -EEXIST) {	/* an item with this (bytenr, parent) key already exists: bump its count */
+		u64 existing_owner;
+		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);	/* an existing item is only expected when owner_objectid >= BTRFS_FIRST_FREE_OBJECTID */
+		leaf = path->nodes[0];
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_ref);
+		if (btrfs_ref_root(leaf, ref) != ref_root ||
+		    btrfs_ref_generation(leaf, ref) != ref_generation) {
+			ret = -EIO;	/* existing item belongs to a different root/generation */
+			WARN_ON(1);
 			goto out;
-		key.offset++;
-		btrfs_release_path(root, path);
-		ret = btrfs_insert_empty_item(trans, root, path, &key,
-					      sizeof(ref));
-	}
-	if (ret)
+		}
+
+		num_refs = btrfs_ref_num_refs(leaf, ref);
+		BUG_ON(num_refs == 0);
+		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+
+		existing_owner = btrfs_ref_objectid(leaf, ref);
+		if (existing_owner == owner_objectid &&
+		    btrfs_ref_offset(leaf, ref) > owner_offset) {	/* same owner: keep the lowest offset seen */
+			btrfs_set_ref_offset(leaf, ref, owner_offset);
+		} else if (existing_owner != owner_objectid &&
+			   existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {	/* different owner: mark the item as shared and clear the offset */
+			btrfs_set_ref_objectid(leaf, ref,
+					BTRFS_MULTIPLE_OBJECTIDS);
+			btrfs_set_ref_offset(leaf, ref, 0);
+		}
+		ret = 0;
+	} else {	/* any other insert error is returned unchanged */
 		goto out;
-	disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				  struct btrfs_extent_ref);
-	write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
-			    sizeof(ref));
+	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
 	return ret;
 }
 
+static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path)	/* drops one reference from the backref item that path points at; returns 0 or the btrfs_del_item() error */
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref *ref;
+	u32 num_refs;
+	int ret = 0;
+
+	leaf = path->nodes[0];	/* path must already point at the backref item; no search is done here */
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+	num_refs = btrfs_ref_num_refs(leaf, ref);
+	BUG_ON(num_refs == 0);
+	num_refs -= 1;
+	if (num_refs == 0) {	/* last reference gone: delete the whole item */
+		ret = btrfs_del_item(trans, root, path);
+	} else {	/* still referenced: store the decremented count */
+		btrfs_set_ref_num_refs(leaf, ref, num_refs);
+		btrfs_mark_buffer_dirty(leaf);
+	}
+	btrfs_release_path(root, path);	/* path is released on every exit, so callers must search again before reuse */
+	return ret;
+}
+
+static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, u64 bytenr,
+				     u64 orig_parent, u64 parent,
+				     u64 orig_root, u64 ref_root,
+				     u64 orig_generation, u64 ref_generation,
+				     u64 owner_objectid, u64 owner_offset)	/* moves the backref for bytenr from (orig_parent, orig_root, orig_generation) to (parent, ref_root, ref_generation) */
+{
+	int ret;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_path *path;
+
+	if (root == root->fs_info->extent_root) {	/* extent tree's own blocks: record the update in extent_ins instead of modifying the tree here */
+		struct pending_extent_op *extent_op;
+		u64 num_bytes;
+
+		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);	/* owner_objectid is used as the block level in this branch */
+		num_bytes = btrfs_level_size(root, (int)owner_objectid);
+		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
+				bytenr + num_bytes - 1, EXTENT_LOCKED, 0)) {	/* an op is already recorded for this range: retarget it */
+			u64 priv;
+			ret = get_state_private(&root->fs_info->extent_ins,
+						bytenr, &priv);
+			BUG_ON(ret);
+			extent_op = (struct pending_extent_op *)
+							(unsigned long)priv;
+			BUG_ON(extent_op->parent != orig_parent);	/* the recorded op must currently describe the orig_* state */
+			BUG_ON(extent_op->generation != orig_generation);
+			extent_op->parent = parent;
+			extent_op->generation = ref_generation;
+		} else {	/* nothing recorded yet: allocate a new PENDING_BACKREF_UPDATE op */
+			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+			BUG_ON(!extent_op);
+
+			extent_op->type = PENDING_BACKREF_UPDATE;
+			extent_op->bytenr = bytenr;
+			extent_op->num_bytes = num_bytes;
+			extent_op->parent = parent;
+			extent_op->orig_parent = orig_parent;
+			extent_op->generation = ref_generation;
+			extent_op->orig_generation = orig_generation;
+			extent_op->level = (int)owner_objectid;
+
+			set_extent_bits(&root->fs_info->extent_ins,
+					bytenr, bytenr + num_bytes - 1,
+					EXTENT_LOCKED, GFP_NOFS);
+			set_state_private(&root->fs_info->extent_ins,
+					  bytenr, (unsigned long)extent_op);	/* stash the op pointer as the range's private value */
+		}
+		return 0;	/* no tree modification happens in this branch */
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = lookup_extent_backref(trans, extent_root, path,
+				    bytenr, orig_parent, orig_root,
+				    orig_generation, 1);	/* other roots: find the old backref now, with del = 1 */
+	if (ret)
+		goto out;
+	ret = remove_extent_backref(trans, extent_root, path);	/* drop one reference from the old backref (also releases path) */
+	if (ret)
+		goto out;
+	ret = insert_extent_backref(trans, extent_root, path, bytenr,
+				    parent, ref_root, ref_generation,
+				    owner_objectid, owner_offset);	/* add the reference under the new parent */
+	BUG_ON(ret);
+	finish_current_insert(trans, extent_root);	/* return values of these two calls are not checked */
+	del_pending_extents(trans, extent_root);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 orig_parent, u64 parent,
+			    u64 ref_root, u64 ref_generation,
+			    u64 owner_objectid, u64 owner_offset)	/* wrapper around __btrfs_update_extent_ref() for a change of parent only */
+{
+	int ret;
+	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return 0;	/* no-op for tree-log refs whose owner is below BTRFS_FIRST_FREE_OBJECTID */
+	maybe_lock_mutex(root);
+	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
+					parent, ref_root, ref_root,	/* same root and generation passed as both the orig and new values */
+					ref_generation, ref_generation,
+					owner_objectid, owner_offset);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset)
+				  struct btrfs_root *root, u64 bytenr,
+				  u64 orig_parent, u64 parent,
+				  u64 orig_root, u64 ref_root,
+				  u64 orig_generation, u64 ref_generation,
+				  u64 owner_objectid, u64 owner_offset)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -752,24 +834,28 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *item;
 	u32 refs;
 
-	WARN_ON(num_bytes < root->sectorsize);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	path->reada = 1;
 	key.objectid = bytenr;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	key.offset = num_bytes;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = (u64)-1;
+
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 1);
 	if (ret < 0)
 		return ret;
-	if (ret != 0) {
-		BUG();
-	}
-	BUG_ON(ret != 0);
+	BUG_ON(ret == 0 || path->slots[0] == 0);
+
+	path->slots[0]--;
 	l = path->nodes[0];
+
+	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+	BUG_ON(key.objectid != bytenr);
+	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
+
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(l, item);
 	btrfs_set_extent_refs(l, item, refs + 1);
@@ -778,9 +864,10 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_release_path(root->fs_info->extent_root, path);
 
 	path->reada = 1;
-	ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
-					  path, bytenr, root_objectid,
-					  ref_generation, owner, owner_offset);
+	ret = insert_extent_backref(trans, root->fs_info->extent_root,
+				    path, bytenr, parent,
+				    ref_root, ref_generation,
+				    owner_objectid, owner_offset);
 	BUG_ON(ret);
 	finish_current_insert(trans, root->fs_info->extent_root);
 	del_pending_extents(trans, root->fs_info->extent_root);
@@ -790,18 +877,20 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset)
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 ref_root, u64 ref_generation,
+			 u64 owner_objectid, u64 owner_offset)	/* num_bytes is not used in this body */
 {
 	int ret;
-
-	mutex_lock(&root->fs_info->alloc_mutex);
-	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
-				     root_objectid, ref_generation,
-				     owner, owner_offset);
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return 0;	/* no-op for tree-log refs whose owner is below BTRFS_FIRST_FREE_OBJECTID */
+	maybe_lock_mutex(root);
+	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,	/* orig_parent, orig_root and orig_generation are all passed as 0 */
+				     0, ref_root, 0, ref_generation,
+				     owner_objectid, owner_offset);
+	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -813,9 +902,9 @@ int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int lookup_extent_ref(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u32 *refs)
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -846,7 +935,6 @@ out:
 	return 0;
 }
 
-
 static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 				u64 parent_gen, u64 ref_objectid,
 			        u64 *min_generation, u32 *ref_count)
@@ -863,7 +951,7 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	int ret;
 
 	key.objectid = bytenr;
-	key.offset = 0;
+	key.offset = (u64)-1;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
@@ -872,7 +960,10 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
+	if (ret < 0 || path->slots[0] == 0)
+		goto out;
 
+	path->slots[0]--;
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
@@ -909,7 +1000,7 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 					  struct btrfs_extent_ref);
 		ref_generation = btrfs_ref_generation(leaf, ref_item);
 		/*
-		 * For (parent_gen > 0 && parent_gen > ref_gen):
+		 * For (parent_gen > 0 && parent_gen > ref_generation):
 		 *
 		 * we reach here through the oldest root, therefore
 		 * all other reference from same snapshot should have
@@ -919,8 +1010,7 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 		    (parent_gen > 0 && parent_gen > ref_generation) ||
 		    (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
 		     ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
-			if (ref_count)
-				*ref_count = 2;
+			*ref_count = 2;
 			break;
 		}
 
@@ -1020,80 +1110,29 @@ out:
 	return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int cache_ref)
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct extent_buffer *buf, u32 nr_extents)
 {
-	u64 bytenr;
 	u32 nritems;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int level;
-	int ret;
-	int faili;
-	int nr_file_extents = 0;
+	int ret = 0;
 
 	if (!root->ref_cows)
 		return 0;
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		if (level == 0) {
-			u64 disk_bytenr;
-			btrfs_item_key_to_cpu(buf, &key, i);
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-				continue;
-			fi = btrfs_item_ptr(buf, i,
-					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(buf, fi) ==
-			    BTRFS_FILE_EXTENT_INLINE)
-				continue;
-			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (disk_bytenr == 0)
-				continue;
-
-			if (buf != root->commit_root)
-				nr_file_extents++;
-
-			mutex_lock(&root->fs_info->alloc_mutex);
-			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
-				    btrfs_file_extent_disk_num_bytes(buf, fi),
-				    root->root_key.objectid, trans->transid,
-				    key.objectid, key.offset);
-			mutex_unlock(&root->fs_info->alloc_mutex);
-			if (ret) {
-				faili = i;
-				WARN_ON(1);
-				goto fail;
-			}
-		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
-			btrfs_node_key_to_cpu(buf, &key, i);
 
-			mutex_lock(&root->fs_info->alloc_mutex);
-			ret = __btrfs_inc_extent_ref(trans, root, bytenr,
-					   btrfs_level_size(root, level - 1),
-					   root->root_key.objectid,
-					   trans->transid,
-					   level - 1, key.objectid);
-			mutex_unlock(&root->fs_info->alloc_mutex);
-			if (ret) {
-				faili = i;
-				WARN_ON(1);
-				goto fail;
-			}
-		}
-	}
-	/* cache orignal leaf block's references */
-	if (level == 0 && cache_ref && buf != root->commit_root) {
+	if (level == 0) {
 		struct btrfs_leaf_ref *ref;
 		struct btrfs_extent_info *info;
 
-		ref = btrfs_alloc_leaf_ref(root, nr_file_extents);
+		ref = btrfs_alloc_leaf_ref(root, nr_extents);
 		if (!ref) {
-			WARN_ON(1);
+			ret = -ENOMEM;
 			goto out;
 		}
 
@@ -1101,10 +1140,10 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
-		ref->nritems = nr_file_extents;
+		ref->nritems = nr_extents;
 		info = ref->extents;
 
-		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
+		for (i = 0; nr_extents > 0 && i < nritems; i++) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -1132,13 +1171,52 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		btrfs_free_leaf_ref(root, ref);
 	}
 out:
-	return 0;
-fail:
-	WARN_ON(1);
-#if 0
-	for (i =0; i < faili; i++) {
+	return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+		  u32 *nr_extents)
+{
+	u64 bytenr;
+	u64 ref_root;
+	u64 orig_root;
+	u64 ref_generation;
+	u64 orig_generation;
+	u32 nritems;
+	u32 nr_file_extents = 0;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int i;
+	int level;
+	int ret = 0;
+	int faili = 0;
+	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
+
+	ref_root = btrfs_header_owner(buf);
+	ref_generation = btrfs_header_generation(buf);
+	orig_root = btrfs_header_owner(orig_buf);
+	orig_generation = btrfs_header_generation(orig_buf);
+
+	nritems = btrfs_header_nritems(buf);
+	level = btrfs_header_level(buf);
+
+	if (root->ref_cows) {
+		process_func = __btrfs_inc_extent_ref;
+	} else {
+		if (level == 0 &&
+		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			goto out;
+		if (level != 0 &&
+		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+			goto out;
+		process_func = __btrfs_update_extent_ref;
+	}
+
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
 		if (level == 0) {
-			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
@@ -1147,24 +1225,131 @@ fail:
 			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
 				continue;
-			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (disk_bytenr == 0)
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)
 				continue;
-			err = btrfs_free_extent(trans, root, disk_bytenr,
-				    btrfs_file_extent_disk_num_bytes(buf,
-								      fi), 0);
-			BUG_ON(err);
+
+			nr_file_extents++;
+
+			maybe_lock_mutex(root);
+			ret = process_func(trans, root, bytenr,
+					   orig_buf->start, buf->start,
+					   orig_root, ref_root,
+					   orig_generation, ref_generation,
+					   key.objectid, key.offset);
+			maybe_unlock_mutex(root);
+
+			if (ret) {
+				faili = i;
+				WARN_ON(1);
+				goto fail;
+			}
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			err = btrfs_free_extent(trans, root, bytenr,
-					btrfs_level_size(root, level - 1), 0);
-			BUG_ON(err);
+			maybe_lock_mutex(root);
+			ret = process_func(trans, root, bytenr,
+					   orig_buf->start, buf->start,
+					   orig_root, ref_root,
+					   orig_generation, ref_generation,
+					   level - 1, 0);
+			maybe_unlock_mutex(root);
+			if (ret) {
+				faili = i;
+				WARN_ON(1);
+				goto fail;
+			}
 		}
 	}
-#endif
+out:
+	if (nr_extents) {
+		if (level == 0)
+			*nr_extents = nr_file_extents;
+		else
+			*nr_extents = nritems;
+	}
+	return 0;
+fail:
+	WARN_ON(1);
 	return ret;
 }
 
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root, struct extent_buffer *orig_buf,
+		     struct extent_buffer *buf, int start_slot, int nr)	/* re-parents backrefs for nr slots of buf starting at start_slot, from orig_buf to buf; returns 0 or the first helper error */
+
+{
+	u64 bytenr;
+	u64 ref_root;
+	u64 orig_root;
+	u64 ref_generation;
+	u64 orig_generation;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int i;
+	int ret;
+	int slot;
+	int level;
+
+	BUG_ON(start_slot < 0);
+	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));	/* the slot range must lie inside buf */
+
+	ref_root = btrfs_header_owner(buf);	/* new backref identity comes from buf's header, the old one from orig_buf's */
+	ref_generation = btrfs_header_generation(buf);
+	orig_root = btrfs_header_owner(orig_buf);
+	orig_generation = btrfs_header_generation(orig_buf);
+	level = btrfs_header_level(buf);
+
+	if (!root->ref_cows) {	/* without ref_cows only tree-log leaves and non-log interior nodes are processed */
+		if (level == 0 &&
+		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			return 0;
+		if (level != 0 &&
+		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+			return 0;
+	}
+
+	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
+		cond_resched();
+		if (level == 0) {	/* leaf: one update per non-inline file extent item with a non-zero disk bytenr */
+			btrfs_item_key_to_cpu(buf, &key, slot);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf, slot,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(buf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)	/* a disk bytenr of 0 is skipped: there is no extent to update */
+				continue;
+			maybe_lock_mutex(root);
+			ret = __btrfs_update_extent_ref(trans, root, bytenr,
+					    orig_buf->start, buf->start,
+					    orig_root, ref_root,
+					    orig_generation, ref_generation,
+					    key.objectid, key.offset);
+			maybe_unlock_mutex(root);
+			if (ret)
+				goto fail;
+		} else {	/* interior node: one update per child block pointer, owner is the child's level */
+			bytenr = btrfs_node_blockptr(buf, slot);
+			maybe_lock_mutex(root);
+			ret = __btrfs_update_extent_ref(trans, root, bytenr,
+					    orig_buf->start, buf->start,
+					    orig_root, ref_root,
+					    orig_generation, ref_generation,
+					    level - 1, 0);
+			maybe_unlock_mutex(root);
+			if (ret)
+				goto fail;
+		}
+	}
+	return 0;
+fail:	/* only reached after ret was set by a failed __btrfs_update_extent_ref() call */
+	WARN_ON(1);
+	return ret;	/* propagate the helper's error code (-ENOMEM, -ENOENT, -EIO, ...) to the caller */
+}
+
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
@@ -1539,19 +1724,18 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 {
 	u64 start;
 	u64 end;
+	u64 priv;
 	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct extent_buffer *eb;
 	struct btrfs_path *path;
-	struct btrfs_key ins;
-	struct btrfs_disk_key first;
+	struct btrfs_extent_ref *ref;
+	struct pending_extent_op *extent_op;
+	struct btrfs_key key;
 	struct btrfs_extent_item extent_item;
 	int ret;
-	int level;
 	int err = 0;
 
 	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	btrfs_set_stack_extent_refs(&extent_item, 1);
-	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 	path = btrfs_alloc_path();
 
 	while(1) {
@@ -1560,37 +1744,54 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 		if (ret)
 			break;
 
-		ins.objectid = start;
-		ins.offset = end + 1 - start;
-		err = btrfs_insert_item(trans, extent_root, &ins,
+		ret = get_state_private(&info->extent_ins, start, &priv);
+		BUG_ON(ret);
+		extent_op = (struct pending_extent_op *)(unsigned long)priv;
+
+		if (extent_op->type == PENDING_EXTENT_INSERT) {
+			key.objectid = start;
+			key.offset = end + 1 - start;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			err = btrfs_insert_item(trans, extent_root, &key,
 					&extent_item, sizeof(extent_item));
-		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
-				  GFP_NOFS);
+			BUG_ON(err);
 
-		eb = btrfs_find_create_tree_block(extent_root, ins.objectid,
-					   ins.offset);
+			clear_extent_bits(&info->extent_ins, start, end,
+					  EXTENT_LOCKED, GFP_NOFS);
 
-		if (!btrfs_buffer_uptodate(eb, trans->transid))
-			btrfs_read_buffer(eb, trans->transid);
+			err = insert_extent_backref(trans, extent_root, path,
+						start, extent_op->parent,
+						extent_root->root_key.objectid,
+						extent_op->generation,
+						extent_op->level, 0);
+			BUG_ON(err);
+		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
+			err = lookup_extent_backref(trans, extent_root, path,
+						start, extent_op->orig_parent,
+						extent_root->root_key.objectid,
+						extent_op->orig_generation, 0);
+			BUG_ON(err);
 
-		btrfs_tree_lock(eb);
-		level = btrfs_header_level(eb);
-		if (level == 0) {
-			btrfs_item_key(eb, &first, 0);
+			clear_extent_bits(&info->extent_ins, start, end,
+					  EXTENT_LOCKED, GFP_NOFS);
+
+			key.objectid = start;
+			key.offset = extent_op->parent;
+			key.type = BTRFS_EXTENT_REF_KEY;
+			err = btrfs_set_item_key_safe(trans, extent_root, path,
+						      &key);
+			BUG_ON(err);
+			ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					     struct btrfs_extent_ref);
+			btrfs_set_ref_generation(path->nodes[0], ref,
+						 extent_op->generation);
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+			btrfs_release_path(extent_root, path);
 		} else {
-			btrfs_node_key(eb, &first, 0);
+			BUG_ON(1);
 		}
-		btrfs_tree_unlock(eb);
-		free_extent_buffer(eb);
-		/*
-		 * the first key is just a hint, so the race we've created
-		 * against reading it is fine
-		 */
-		err = btrfs_insert_extent_backref(trans, extent_root, path,
-					  start, extent_root->root_key.objectid,
-					  0, level,
-					  btrfs_disk_key_objectid(&first));
-		BUG_ON(err);
+		kfree(extent_op);
+
 		if (need_resched()) {
 			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
@@ -1601,52 +1802,44 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
-			  int is_data, int pending)
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 bytenr, u64 num_bytes, int is_data)	/* returns 1 if the tree block was cleaned for reuse instead of pinned, 0 if the range was pinned */
 {
 	int err = 0;
+	struct extent_buffer *buf;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
-	if (!pending) {
-		struct extent_buffer *buf;
-
-		if (is_data)
-			goto pinit;
-
-		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
-		if (buf) {
-			/* we can reuse a block if it hasn't been written
-			 * and it is from this transaction.  We can't
-			 * reuse anything from the tree log root because
-			 * it has tiny sub-transactions.
-			 */
-			if (btrfs_buffer_uptodate(buf, 0) &&
-			    btrfs_try_tree_lock(buf)) {
-				u64 transid =
-				    root->fs_info->running_transaction->transid;
-				u64 header_transid =
-					btrfs_header_generation(buf);
-				if (btrfs_header_owner(buf) !=
-				    BTRFS_TREE_LOG_OBJECTID &&
-				    header_transid == transid &&
-				    !btrfs_header_flag(buf,
-					       BTRFS_HEADER_FLAG_WRITTEN)) {
-					clean_tree_block(NULL, root, buf);
-					btrfs_tree_unlock(buf);
-					free_extent_buffer(buf);
-					return 1;
-				}
-				btrfs_tree_unlock(buf);
-			}
+	if (is_data)	/* data extents skip the reuse check and are always pinned */
+		goto pinit;
+
+	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
+	if (!buf)	/* no buffer returned for this range: nothing to inspect, just pin */
+		goto pinit;
+
+	/* we can reuse a block if it hasn't been written
+	 * and it is from this transaction.  We can't
+	 * reuse anything from the tree log root because
+	 * it has tiny sub-transactions.
+	 */
+	if (btrfs_buffer_uptodate(buf, 0) &&
+	    btrfs_try_tree_lock(buf)) {	/* trylock only: if the lock is not obtained the block is pinned */
+		u64 header_owner = btrfs_header_owner(buf);
+		u64 header_transid = btrfs_header_generation(buf);
+		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+		    header_transid == trans->transid &&	/* compared against the handle's transid */
+		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+			clean_tree_block(NULL, root, buf);
+			btrfs_tree_unlock(buf);
 			free_extent_buffer(buf);
+			return 1;	/* reusable: the range is not pinned */
 		}
-pinit:
-		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-	} else {
-		set_extent_bits(&root->fs_info->pending_del,
-				bytenr, bytenr + num_bytes - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+		btrfs_tree_unlock(buf);
 	}
+	free_extent_buffer(buf);	/* drop the reference taken by btrfs_find_tree_block() */
+pinit:
+	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);	/* last argument 1 presumably means "pin" - TODO confirm */
+
 	BUG_ON(err < 0);
 	return 0;
 }
@@ -1654,11 +1847,12 @@ pinit:
 /*
  * remove an extent from the root, returns 0 on success
  */
-static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 bytenr, u64 num_bytes,
+static int __free_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, u64 owner_offset, int pin,
-			 int mark_free)
+			 u64 owner_objectid, u64 owner_offset,
+			 int pin, int mark_free)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -1681,10 +1875,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		return -ENOMEM;
 
 	path->reada = 1;
-	ret = lookup_extent_backref(trans, extent_root, path,
-				    bytenr, root_objectid,
-				    ref_generation,
-				    owner_objectid, owner_offset, 1);
+	ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent,
+				    root_objectid, ref_generation, 1);
 	if (ret == 0) {
 		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
@@ -1702,8 +1894,15 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			if (path->slots[0] - extent_slot > 5)
 				break;
 		}
-		if (!found_extent)
-			ret = btrfs_del_item(trans, extent_root, path);
+		if (!found_extent) {
+			ret = remove_extent_backref(trans, extent_root, path);
+			BUG_ON(ret);
+			btrfs_release_path(extent_root, path);
+			ret = btrfs_search_slot(trans, extent_root,
+						&key, path, -1, 1);
+			BUG_ON(ret);
+			extent_slot = path->slots[0];
+		}
 	} else {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
@@ -1712,14 +1911,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		       root_objectid, ref_generation, owner_objectid,
 		       owner_offset);
 	}
-	if (!found_extent) {
-		btrfs_release_path(extent_root, path);
-		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-		if (ret < 0)
-			return ret;
-		BUG_ON(ret);
-		extent_slot = path->slots[0];
-	}
 
 	leaf = path->nodes[0];
 	ei = btrfs_item_ptr(leaf, extent_slot,
@@ -1732,6 +1923,10 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+		struct btrfs_extent_ref *ref;
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_ref);
+		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
 		/* if the back ref and the extent are next to each other
 		 * they get deleted below in one shot
 		 */
@@ -1739,15 +1934,13 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		num_to_del = 2;
 	} else if (found_extent) {
 		/* otherwise delete the extent back ref */
-		ret = btrfs_del_item(trans, extent_root, path);
+		ret = remove_extent_backref(trans, extent_root, path);
 		BUG_ON(ret);
 		/* if refs are 0, we need to setup the path for deletion */
 		if (refs == 0) {
 			btrfs_release_path(extent_root, path);
 			ret = btrfs_search_slot(trans, extent_root, &key, path,
 						-1, 1);
-			if (ret < 0)
-				return ret;
 			BUG_ON(ret);
 		}
 	}
@@ -1761,8 +1954,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 #endif
 
 		if (pin) {
-			ret = pin_down_bytes(root, bytenr, num_bytes,
-			     owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 0);
+			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
+				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
@@ -1781,9 +1974,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 					   root_used - num_bytes);
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
-		if (ret) {
-			return ret;
-		}
+		BUG_ON(ret);
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free);
 		BUG_ON(ret);
@@ -1822,33 +2013,61 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 {
 	int ret;
 	int err = 0;
+	int mark_free = 0;
 	u64 start;
 	u64 end;
+	u64 priv;
 	struct extent_io_tree *pending_del;
-	struct extent_io_tree *pinned_extents;
+	struct extent_io_tree *extent_ins;
+	struct pending_extent_op *extent_op;
 
 	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+	extent_ins = &extent_root->fs_info->extent_ins;
 	pending_del = &extent_root->fs_info->pending_del;
-	pinned_extents = &extent_root->fs_info->pinned_extents;
 
 	while(1) {
 		ret = find_first_extent_bit(pending_del, 0, &start, &end,
 					    EXTENT_LOCKED);
 		if (ret)
 			break;
+
+		ret = get_state_private(pending_del, start, &priv);
+		BUG_ON(ret);
+		extent_op = (struct pending_extent_op *)(unsigned long)priv;
+
 		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
-		if (!test_range_bit(&extent_root->fs_info->extent_ins,
-				    start, end, EXTENT_LOCKED, 0)) {
-			btrfs_update_pinned_extents(extent_root, start,
-					      end + 1 - start, 1);
+
+		ret = pin_down_bytes(trans, extent_root, start,
+				     end + 1 - start, 0);
+		mark_free = ret > 0;
+		if (!test_range_bit(extent_ins, start, end,
+				    EXTENT_LOCKED, 0)) {
+free_extent:
 			ret = __free_extent(trans, extent_root,
-					     start, end + 1 - start,
-					     extent_root->root_key.objectid,
-					     0, 0, 0, 0, 0);
+					    start, end + 1 - start,
+					    extent_op->orig_parent,
+					    extent_root->root_key.objectid,
+					    extent_op->orig_generation,
+					    extent_op->level, 0, 0, mark_free);
+			kfree(extent_op);
 		} else {
-			clear_extent_bits(&extent_root->fs_info->extent_ins,
-					  start, end, EXTENT_LOCKED, GFP_NOFS);
+			kfree(extent_op);
+			ret = get_state_private(extent_ins, start, &priv);
+			BUG_ON(ret);
+			extent_op = (struct pending_extent_op *)
+							(unsigned long)priv;
+
+			clear_extent_bits(extent_ins, start, end,
+					  EXTENT_LOCKED, GFP_NOFS);
+
+			if (extent_op->type == PENDING_BACKREF_UPDATE)
+				goto free_extent;
+
+			ret = update_block_group(trans, extent_root, start,
+						end + 1 - start, 0, mark_free);
+			BUG_ON(ret);
+			kfree(extent_op);
 		}
 		if (ret)
 			err = ret;
@@ -1866,21 +2085,36 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
  * remove an extent from the root, returns 0 on success
  */
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root, u64 bytenr,
-			       u64 num_bytes, u64 root_objectid,
-			       u64 ref_generation, u64 owner_objectid,
-			       u64 owner_offset, int pin)
+			       struct btrfs_root *root,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 root_objectid, u64 ref_generation,
+			       u64 owner_objectid, u64 owner_offset, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
 	int ret;
 
 	WARN_ON(num_bytes < root->sectorsize);
-	if (!root->ref_cows)
-		ref_generation = 0;
-
 	if (root == extent_root) {
-		pin_down_bytes(root, bytenr, num_bytes, 0, 1);
+		struct pending_extent_op *extent_op;
+
+		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		BUG_ON(!extent_op);
+
+		extent_op->type = PENDING_EXTENT_DELETE;
+		extent_op->bytenr = bytenr;
+		extent_op->num_bytes = num_bytes;
+		extent_op->parent = parent;
+		extent_op->orig_parent = parent;
+		extent_op->generation = ref_generation;
+		extent_op->orig_generation = ref_generation;
+		extent_op->level = (int)owner_objectid;
+
+		set_extent_bits(&root->fs_info->pending_del,
+				bytenr, bytenr + num_bytes - 1,
+				EXTENT_LOCKED, GFP_NOFS);
+		set_state_private(&root->fs_info->pending_del,
+				  bytenr, (unsigned long)extent_op);
 		return 0;
 	}
 	/* if metadata always pin */
@@ -1901,9 +2135,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (ref_generation != trans->transid)
 		pin = 1;
 
-	ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
-			    ref_generation, owner_objectid, owner_offset,
-			    pin, pin == 0);
+	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
+			    root_objectid, ref_generation, owner_objectid,
+			    owner_offset, pin, pin == 0);
 
 	finish_current_insert(trans, root->fs_info->extent_root);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
@@ -1911,15 +2145,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root, u64 bytenr,
-		      u64 num_bytes, u64 root_objectid,
-		      u64 ref_generation, u64 owner_objectid,
-		      u64 owner_offset, int pin)
+		      struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent,
+		      u64 root_objectid, u64 ref_generation,
+		      u64 owner_objectid, u64 owner_offset, int pin)
 {
 	int ret;
 
 	maybe_lock_mutex(root);
-	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes,
+	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
 				  root_objectid, ref_generation,
 				  owner_objectid, owner_offset, pin);
 	maybe_unlock_mutex(root);
@@ -2271,7 +2505,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root,
+					 struct btrfs_root *root, u64 parent,
 					 u64 root_objectid, u64 ref_generation,
 					 u64 owner, u64 owner_offset,
 					 struct btrfs_key *ins)
@@ -2289,6 +2523,9 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
+	if (parent == 0)
+		parent = ins->objectid;
+
 	/* block accounting for super block */
 	spin_lock_irq(&info->delalloc_lock);
 	super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2300,17 +2537,32 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 
 	if (root == extent_root) {
+		struct pending_extent_op *extent_op;
+
+		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		BUG_ON(!extent_op);
+
+		extent_op->type = PENDING_EXTENT_INSERT;
+		extent_op->bytenr = ins->objectid;
+		extent_op->num_bytes = ins->offset;
+		extent_op->parent = parent;
+		extent_op->orig_parent = 0;
+		extent_op->generation = ref_generation;
+		extent_op->orig_generation = 0;
+		extent_op->level = (int)owner;
+
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
 				EXTENT_LOCKED, GFP_NOFS);
+		set_state_private(&root->fs_info->extent_ins,
+				  ins->objectid, (unsigned long)extent_op);
 		goto update_block;
 	}
 
 	memcpy(&keys[0], ins, sizeof(*ins));
-	keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
-					 owner, owner_offset);
 	keys[1].objectid = ins->objectid;
 	keys[1].type = BTRFS_EXTENT_REF_KEY;
+	keys[1].offset = parent;
 	sizes[0] = sizeof(*extent_item);
 	sizes[1] = sizeof(*ref);
 
@@ -2331,6 +2583,7 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
 	btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
+	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
@@ -2359,7 +2612,7 @@ out:
 }
 
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins)
@@ -2369,9 +2622,9 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
 	maybe_lock_mutex(root);
-	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-					    ref_generation, owner,
-					    owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+					    root_objectid, ref_generation,
+					    owner, owner_offset, ins);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2382,7 +2635,7 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
  * space cache bits as well
  */
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins)
@@ -2396,10 +2649,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
 	BUG_ON(ret);
-
-	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-					    ref_generation, owner,
-					    owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+					    root_objectid, ref_generation,
+					    owner, owner_offset, ins);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2413,9 +2665,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
-		       u64 num_bytes, u64 min_alloc_size,
+		       u64 num_bytes, u64 parent, u64 min_alloc_size,
 		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 owner_offset,
+		       u64 owner_objectid, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
@@ -2428,9 +2680,9 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 				     search_end, ins, data);
 	BUG_ON(ret);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-						    ref_generation, owner,
-						    owner_offset, ins);
+		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+					root_objectid, ref_generation,
+					owner_objectid, owner_offset, ins);
 		BUG_ON(ret);
 
 	}
@@ -2468,10 +2720,9 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
-					     u32 blocksize,
+					     u32 blocksize, u64 parent,
 					     u64 root_objectid,
 					     u64 ref_generation,
-					     u64 first_objectid,
 					     int level,
 					     u64 hint,
 					     u64 empty_size)
@@ -2480,10 +2731,9 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	int ret;
 	struct extent_buffer *buf;
 
-	ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
-				 root_objectid, ref_generation,
-				 level, first_objectid, empty_size, hint,
-				 (u64)-1, &ins, 0);
+	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
+				 root_objectid, ref_generation, level, 0,
+				 empty_size, hint, (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
@@ -2531,15 +2781,14 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
-				leaf_owner, leaf_generation,
+				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, key.offset, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
+		BUG_ON(ret);
 
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
-
-		BUG_ON(ret);
 	}
 	return 0;
 }
@@ -2554,10 +2803,10 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 
 	for (i = 0; i < ref->nritems; i++) {
 		mutex_lock(&root->fs_info->alloc_mutex);
-		ret = __btrfs_free_extent(trans, root,
-					info->bytenr, info->num_bytes,
-					ref->owner, ref->generation,
-					info->objectid, info->offset, 0);
+		ret = __btrfs_free_extent(trans, root, info->bytenr,
+					  info->num_bytes, ref->bytenr,
+					  ref->owner, ref->generation,
+					  info->objectid, info->offset, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		atomic_inc(&root->fs_info->throttle_gen);
@@ -2576,7 +2825,7 @@ int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 {
 	int ret;
 
-	ret = lookup_extent_ref(NULL, root, start, len, refs);
+	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
 	BUG_ON(ret);
 
 #if 0 // some debugging code in case we see problems here
@@ -2672,8 +2921,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
-						blocksize, root_owner,
-						root_gen, 0, 0, 1);
+						blocksize, parent->start,
+						root_owner, root_gen, 0, 0, 1);
 			BUG_ON(ret);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 
@@ -2690,8 +2939,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		 * So, we don't need to check it again
 		 */
 		if (*level == 1) {
-			struct btrfs_key key;
-			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
 			ref = btrfs_lookup_leaf_ref(root, bytenr);
 			if (ref) {
 				ret = cache_drop_leaf_ref(trans, root, ref);
@@ -2750,12 +2997,13 @@ out:
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
-				  root_owner, root_gen, 0, 0, 1);
+				  parent->start, root_owner, root_gen,
+				  0, 0, 1);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 
 	cond_resched();
 	return 0;
@@ -2792,19 +3040,18 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
 			root_item->drop_level = i;
 			return 0;
 		} else {
-			if (path->nodes[*level] == root->node) {
-				root_owner = root->root_key.objectid;
-				root_gen =
-				   btrfs_header_generation(path->nodes[*level]);
-			} else {
-				struct extent_buffer *node;
-				node = path->nodes[*level + 1];
-				root_owner = btrfs_header_owner(node);
-				root_gen = btrfs_header_generation(node);
-			}
+			struct extent_buffer *parent;
+			if (path->nodes[*level] == root->node)
+				parent = path->nodes[*level];
+			else
+				parent = path->nodes[*level + 1];
+
+			root_owner = btrfs_header_owner(parent);
+			root_gen = btrfs_header_generation(parent);
 			ret = btrfs_free_extent(trans, root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len,
+						parent->start,
 						root_owner, root_gen, 0, 0, 1);
 			BUG_ON(ret);
 			free_extent_buffer(path->nodes[*level]);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8624f3e8803..58ad25838a4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2201,9 +2201,10 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
-
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 	if (wbc->range_cont)
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
+#endif
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_cache_pages);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 58b329ddb42..48a702d41c8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -524,6 +524,9 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 {
 	u64 extent_end = 0;
 	u64 search_start = start;
+	u64 leaf_start;
+	u64 root_gen;
+	u64 root_owner;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *extent;
 	struct btrfs_path *path;
@@ -562,6 +565,9 @@ next_slot:
 		bookend = 0;
 		found_extent = 0;
 		found_inline = 0;
+		leaf_start = 0;
+		root_gen = 0;
+		root_owner = 0;
 		extent = NULL;
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -628,27 +634,18 @@ next_slot:
 			search_start = extent_end;
 		if (end <= extent_end && start >= key.offset && found_inline) {
 			*hint_byte = EXTENT_MAP_INLINE;
-			continue;
+			goto out;
+		}
+
+		if (found_extent) {
+			read_extent_buffer(leaf, &old, (unsigned long)extent,
+					   sizeof(old));
+			root_gen = btrfs_header_generation(leaf);
+			root_owner = btrfs_header_owner(leaf);
+			leaf_start = leaf->start;
 		}
+
 		if (end < extent_end && end >= key.offset) {
-			if (found_extent) {
-				u64 disk_bytenr =
-				    btrfs_file_extent_disk_bytenr(leaf, extent);
-				u64 disk_num_bytes =
-				    btrfs_file_extent_disk_num_bytes(leaf,
-								      extent);
-				read_extent_buffer(leaf, &old,
-						   (unsigned long)extent,
-						   sizeof(old));
-				if (disk_bytenr != 0) {
-					ret = btrfs_inc_extent_ref(trans, root,
-					         disk_bytenr, disk_num_bytes,
-						 root->root_key.objectid,
-						 trans->transid,
-						 key.objectid, end);
-					BUG_ON(ret);
-				}
-			}
 			bookend = 1;
 			if (found_inline && start <= key.offset)
 				keep = 1;
@@ -687,49 +684,12 @@ next_slot:
 		}
 		/* delete the entire extent */
 		if (!keep) {
-			u64 disk_bytenr = 0;
-			u64 disk_num_bytes = 0;
-			u64 extent_num_bytes = 0;
-			u64 root_gen;
-			u64 root_owner;
-
-			root_gen = btrfs_header_generation(leaf);
-			root_owner = btrfs_header_owner(leaf);
-			if (found_extent) {
-				disk_bytenr =
-				      btrfs_file_extent_disk_bytenr(leaf,
-								     extent);
-				disk_num_bytes =
-				      btrfs_file_extent_disk_num_bytes(leaf,
-								       extent);
-				extent_num_bytes =
-				      btrfs_file_extent_num_bytes(leaf, extent);
-				*hint_byte =
-					btrfs_file_extent_disk_bytenr(leaf,
-								      extent);
-			}
 			ret = btrfs_del_item(trans, root, path);
 			/* TODO update progress marker and return */
 			BUG_ON(ret);
-			btrfs_release_path(root, path);
 			extent = NULL;
-			if (found_extent && disk_bytenr != 0) {
-				dec_i_blocks(inode, extent_num_bytes);
-				ret = btrfs_free_extent(trans, root,
-						disk_bytenr,
-						disk_num_bytes,
-						root_owner,
-						root_gen, inode->i_ino,
-						key.offset, 0);
-			}
-
-			BUG_ON(ret);
-			if (!bookend && search_start >= end) {
-				ret = 0;
-				goto out;
-			}
-			if (!bookend)
-				continue;
+			btrfs_release_path(root, path);
+			/* the extent will be freed later */
 		}
 		if (bookend && found_inline && start <= key.offset) {
 			u32 new_size;
@@ -737,10 +697,13 @@ next_slot:
 						   extent_end - end);
 			dec_i_blocks(inode, (extent_end - key.offset) -
 					(extent_end - end));
-			btrfs_truncate_item(trans, root, path, new_size, 0);
+			ret = btrfs_truncate_item(trans, root, path,
+						  new_size, 0);
+			BUG_ON(ret);
 		}
 		/* create bookend, splitting the extent in two */
 		if (bookend && found_extent) {
+			u64 disk_bytenr;
 			struct btrfs_key ins;
 			ins.objectid = inode->i_ino;
 			ins.offset = end;
@@ -748,13 +711,9 @@ next_slot:
 			btrfs_release_path(root, path);
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
+			BUG_ON(ret);
 
 			leaf = path->nodes[0];
-			if (ret) {
-				btrfs_print_leaf(root, leaf);
-				printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.type, ins.offset, start, end, key.offset, extent_end, keep);
-			}
-			BUG_ON(ret);
 			extent = btrfs_item_ptr(leaf, path->slots[0],
 						struct btrfs_file_extent_item);
 			write_extent_buffer(leaf, &old,
@@ -770,11 +729,45 @@ next_slot:
 						   BTRFS_FILE_EXTENT_REG);
 
 			btrfs_mark_buffer_dirty(path->nodes[0]);
-			if (le64_to_cpu(old.disk_bytenr) != 0) {
+
+			disk_bytenr = le64_to_cpu(old.disk_bytenr);
+			if (disk_bytenr != 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						disk_bytenr,
+						le64_to_cpu(old.disk_num_bytes),
+						leaf->start,
+						root->root_key.objectid,
+						trans->transid,
+						ins.objectid, ins.offset);
+				BUG_ON(ret);
+				/*
+				 * charge the bookend to the inode while the
+				 * path still holds the leaf we read it from
+				 */
 				inode->i_blocks +=
 				      btrfs_file_extent_num_bytes(leaf,
 								  extent) >> 9;
 			}
+			btrfs_release_path(root, path);
+		}
+
+		if (found_extent && !keep) {
+			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
+
+			if (disk_bytenr != 0) {
+				dec_i_blocks(inode, le64_to_cpu(old.num_bytes));
+				ret = btrfs_free_extent(trans, root,
+						disk_bytenr,
+						le64_to_cpu(old.disk_num_bytes),
+						leaf_start, root_owner,
+						root_gen, key.objectid,
+						key.offset, 0);
+				BUG_ON(ret);
+				*hint_byte = disk_bytenr;
+			}
+		}
+
+		if (search_start >= end) {
 			ret = 0;
 			goto out;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 65b4f864b0d..2e7d82ec5d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -528,6 +528,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_file_extent_item *extent_item;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
 	u64 alloc_hint = 0;
 	struct list_head list;
 	struct btrfs_key ins;
@@ -544,20 +547,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
 		goto nocow;
 
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
 	lock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
 
 	INIT_LIST_HEAD(&list);
 
-	ins.objectid = ordered_extent->start;
-	ins.offset = ordered_extent->len;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-	ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
-					  trans->transid, inode->i_ino,
-					  ordered_extent->file_offset, &ins);
-	BUG_ON(ret);
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 
 	ret = btrfs_drop_extents(trans, root, inode,
@@ -566,18 +564,42 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				 ordered_extent->len,
 				 ordered_extent->file_offset, &alloc_hint);
 	BUG_ON(ret);
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       ordered_extent->file_offset,
-				       ordered_extent->start,
-				       ordered_extent->len,
-				       ordered_extent->len, 0);
+
+	ins.objectid = inode->i_ino;
+	ins.offset = ordered_extent->file_offset;
+	ins.type = BTRFS_EXTENT_DATA_KEY;
+	ret = btrfs_insert_empty_item(trans, root, path, &ins,
+				      sizeof(*extent_item));
 	BUG_ON(ret);
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_file_extent_type(leaf, extent_item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
+					  ordered_extent->start);
+	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
+					     ordered_extent->len);
+	btrfs_set_file_extent_offset(leaf, extent_item, 0);
+	btrfs_set_file_extent_num_bytes(leaf, extent_item,
+					ordered_extent->len);
+	btrfs_mark_buffer_dirty(leaf);
 
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
 				ordered_extent->len - 1);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
+	ins.objectid = ordered_extent->start;
+	ins.offset = ordered_extent->len;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
+					  root->root_key.objectid,
+					  trans->transid, inode->i_ino,
+					  ordered_extent->file_offset, &ins);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
+
 	inode->i_blocks += ordered_extent->len >> 9;
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
@@ -596,6 +618,8 @@ nocow:
 	btrfs_put_ordered_extent(ordered_extent);
 
 	btrfs_end_transaction(trans, root);
+	if (path)
+		btrfs_free_path(path);
 	return 0;
 }
 
@@ -1433,10 +1457,7 @@ search_again:
 					if (root->ref_cows)
 						dec_i_blocks(inode, num_dec);
 				}
-				if (root->ref_cows) {
-					root_gen =
-						btrfs_header_generation(leaf);
-				}
+				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -1477,7 +1498,7 @@ delete:
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
-						root_owner,
+						leaf->start, root_owner,
 						root_gen, inode->i_ino,
 						found_key.offset, 0);
 			BUG_ON(ret);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f84b5f6991c..4c6e0c15754 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -76,9 +76,8 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	if (ret)
 		goto fail;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      objectid, trans->transid, 0, 0,
-				      0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+				      objectid, trans->transid, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		goto fail;
@@ -525,13 +524,10 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	struct file *src_file;
 	struct inode *src;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_ordered_extent *ordered;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	char *buf;
 	struct btrfs_key key;
-	struct btrfs_key new_key;
-	u32 size;
 	u32 nritems;
 	int slot;
 	int ret;
@@ -576,6 +572,7 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	/* do any pending delalloc/csum calc on src, one way or
 	   another, and lock file content */
 	while (1) {
+		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
@@ -619,6 +616,32 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		    key.objectid != src->i_ino)
 			break;
 
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
+		    btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
+			u32 size;
+			struct btrfs_key new_key;
+
+			size = btrfs_item_size_nr(leaf, slot);
+			read_extent_buffer(leaf, buf,
+					   btrfs_item_ptr_offset(leaf, slot),
+					   size);
+			btrfs_release_path(root, path);
+
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.objectid = inode->i_ino;
+			ret = btrfs_insert_empty_item(trans, root, path,
+						      &new_key, size);
+			if (ret)
+				goto out;
+
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			write_extent_buffer(leaf, buf,
+					    btrfs_item_ptr_offset(leaf, slot),
+					    size);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 			struct btrfs_file_extent_item *extent;
 			int found_type;
@@ -634,31 +657,15 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 				/* ds == 0 means there's a hole */
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, root,
-						     ds, dl,
+						     ds, dl, leaf->start,
 						     root->root_key.objectid,
 						     trans->transid,
 						     inode->i_ino, key.offset);
-					if (ret)
-						goto out;
+					BUG_ON(ret);
 				}
 			}
 		}
-
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
-		    btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
-			size = btrfs_item_size_nr(leaf, slot);
-			read_extent_buffer(leaf, buf,
-					   btrfs_item_ptr_offset(leaf, slot),
-					   size);
-			btrfs_release_path(root, path);
-			memcpy(&new_key, &key, sizeof(new_key));
-			new_key.objectid = inode->i_ino;
-			ret = btrfs_insert_item(trans, root, &new_key,
-						buf, size);
-			BUG_ON(ret);
-		} else {
-			btrfs_release_path(root, path);
-		}
+		btrfs_release_path(root, path);
 		key.offset++;
 	}
 	ret = 0;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f1374d597a1..3577badfa5b 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -102,11 +102,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_EXTENT_REF_KEY:
 			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
 			printk("\t\textent back ref root %llu gen %llu "
-			       "owner %llu offset %llu\n",
+			       "owner %llu offset %llu num_refs %lu\n",
 			       (unsigned long long)btrfs_ref_root(l, ref),
 			       (unsigned long long)btrfs_ref_generation(l, ref),
 			       (unsigned long long)btrfs_ref_objectid(l, ref),
-			       (unsigned long long)btrfs_ref_offset(l, ref));
+			       (unsigned long long)btrfs_ref_offset(l, ref),
+			       (unsigned long)btrfs_ref_num_refs(l, ref));
 			break;
 
 		case BTRFS_EXTENT_DATA_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 56de3fb2d8d..88bbfd959f1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -89,9 +89,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 objectid = root->root_key.objectid;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 				      BTRFS_TREE_LOG_OBJECTID,
-				      0, 0, 0, 0, 0);
+				      trans->transid, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		return ret;
@@ -433,6 +433,49 @@ insert:
 						   trans->transid);
 		}
 	}
+
+	if (overwrite_root &&
+	    key->type == BTRFS_EXTENT_DATA_KEY) {
+		int extent_type;
+		struct btrfs_file_extent_item *fi;
+
+		fi = (struct btrfs_file_extent_item *)dst_ptr;
+		extent_type = btrfs_file_extent_type(path->nodes[0], fi);
+		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+			struct btrfs_key ins;
+			ins.objectid = btrfs_file_extent_disk_bytenr(
+							path->nodes[0], fi);
+			ins.offset = btrfs_file_extent_disk_num_bytes(
+							path->nodes[0], fi);
+			ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+			/*
+			 * is this extent already allocated in the extent
+			 * allocation tree?  If so, just add a reference
+			 */
+			ret = btrfs_lookup_extent(root, ins.objectid,
+						  ins.offset);
+			if (ret == 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						ins.objectid, ins.offset,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid,
+						key->objectid, key->offset);
+			} else {
+				/*
+				 * insert the extent pointer in the extent
+				 * allocation tree
+				 */
+				ret = btrfs_alloc_logged_extent(trans, root,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid, key->objectid,
+						key->offset, &ins);
+				BUG_ON(ret);
+			}
+		}
+	}
 no_copy:
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(root, path);
@@ -551,45 +594,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			 start, extent_end, start, &alloc_hint);
 	BUG_ON(ret);
 
+	/* insert the extent */
+	ret = overwrite_item(trans, root, path, eb, slot, key);
 	BUG_ON(ret);
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
-		struct btrfs_key ins;
-
-		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
-		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
-		ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-		/* insert the extent pointer in the file */
-		ret = overwrite_item(trans, root, path, eb, slot, key);
-		BUG_ON(ret);
 
-		/*
-		 * is this extent already allocated in the extent
-		 * allocation tree?  If so, just add a reference
-		 */
-		ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset);
-		btrfs_release_path(root, path);
-		if (ret == 0) {
-			ret = btrfs_inc_extent_ref(trans, root,
-				   ins.objectid, ins.offset,
-				   root->root_key.objectid,
-				   trans->transid, key->objectid, start);
-		} else {
-			/*
-			 * insert the extent pointer in the extent
-			 * allocation tree
-			 */
-			ret = btrfs_alloc_logged_extent(trans, root,
-						root->root_key.objectid,
-						trans->transid, key->objectid,
-						start, &ins);
-			BUG_ON(ret);
-		}
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		/* inline extents are easy, we just overwrite them */
-		ret = overwrite_item(trans, root, path, eb, slot, key);
-		BUG_ON(ret);
-	}
 	/* btrfs_drop_extents changes i_blocks, update it here */
 	inode->i_blocks += (extent_end - start) >> 9;
 	btrfs_update_inode(trans, root, inode);
@@ -1806,16 +1814,14 @@ static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
 			WARN_ON(*level == 0);
 			return 0;
 		} else {
-			if (path->nodes[*level] == root->node) {
-				root_owner = root->root_key.objectid;
-				root_gen =
-				   btrfs_header_generation(path->nodes[*level]);
-			} else {
-				struct extent_buffer *node;
-				node = path->nodes[*level + 1];
-				root_owner = btrfs_header_owner(node);
-				root_gen = btrfs_header_generation(node);
-			}
+			struct extent_buffer *parent;
+			if (path->nodes[*level] == root->node)
+				parent = path->nodes[*level];
+			else
+				parent = path->nodes[*level + 1];
+
+			root_owner = btrfs_header_owner(parent);
+			root_gen = btrfs_header_generation(parent);
 			wc->process_func(root, path->nodes[*level], wc,
 				 btrfs_header_generation(path->nodes[*level]));
 			if (wc->free) {
@@ -2525,8 +2531,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, log,
 						   ds, dl,
+						   dst_path->nodes[0]->start,
 						   BTRFS_TREE_LOG_OBJECTID,
-						   0, ins_keys[i].objectid,
+						   trans->transid,
+						   ins_keys[i].objectid,
 						   ins_keys[i].offset);
 					BUG_ON(ret);
 				}
-- 
cgit v1.2.3-70-g09d2


From a76a3cd40c1127ca199d4f7f37bf0d541bf44eb2 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 9 Oct 2008 11:46:29 -0400
Subject: Btrfs: Count space allocated to file in bytes

This patch makes btrfs count the space allocated to a file in bytes
instead of 512-byte sectors.

Everything else in btrfs uses a byte count instead of sector sizes or
block sizes, so this fits better.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h       | 13 ++-----------
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/file.c        | 24 +++++++++++++-----------
 fs/btrfs/inode.c       | 23 ++++++++++++-----------
 fs/btrfs/ioctl.c       |  4 ++--
 fs/btrfs/tree-log.c    |  6 +++---
 6 files changed, 33 insertions(+), 39 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8566eb30f56..50fbcc9ec45 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -411,7 +411,7 @@ struct btrfs_inode_item {
 	/* transid that last touched this inode */
 	__le64 transid;
 	__le64 size;
-	__le64 nblocks;
+	__le64 nbytes;
 	__le64 block_group;
 	__le32 nlink;
 	__le32 uid;
@@ -1017,7 +1017,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
 BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
 BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
 BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
@@ -1814,15 +1814,6 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio);
 
-static inline void dec_i_blocks(struct inode *inode, u64 dec)
-{
-	dec = dec >> 9;
-	if (dec <= inode->i_blocks)
-		inode->i_blocks -= dec;
-	else
-		inode->i_blocks = 0;
-}
-
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 29380467b67..69db54e09fb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3930,7 +3930,7 @@ next:
 				BUG_ON(ret);
 				btrfs_release_path(root, path);
 
-				inode->i_blocks += extent_len >> 9;
+				inode_add_bytes(inode, extent_len);
 
 				ext_offset = 0;
 				num_bytes -= extent_len;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a03d1bbb19a..18dfdf5f91d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -193,7 +193,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 			leaf = path->nodes[0];
 			ei = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
-			inode->i_blocks += (offset + size - found_end) >> 9;
+			inode_add_bytes(inode, offset + size - found_end);
 		}
 		if (found_end < offset) {
 			ptr = btrfs_file_extent_inline_start(ei) + found_size;
@@ -203,7 +203,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 insert:
 		btrfs_release_path(root, path);
 		datasize = offset + size - key.offset;
-		inode->i_blocks += datasize >> 9;
+		inode_add_bytes(inode, datasize);
 		datasize = btrfs_file_extent_calc_inline_size(datasize);
 		ret = btrfs_insert_empty_item(trans, root, path, &key,
 					      datasize);
@@ -713,7 +713,8 @@ next_slot:
 								      extent);
 				if (btrfs_file_extent_disk_bytenr(leaf,
 								  extent)) {
-					dec_i_blocks(inode, old_num - new_num);
+					inode_sub_bytes(inode, old_num -
+							new_num);
 				}
 				btrfs_set_file_extent_num_bytes(leaf, extent,
 								new_num);
@@ -724,14 +725,17 @@ next_slot:
 				u32 new_size;
 				new_size = btrfs_file_extent_calc_inline_size(
 						   inline_limit - key.offset);
-				dec_i_blocks(inode, (extent_end - key.offset) -
-					(inline_limit - key.offset));
+				inode_sub_bytes(inode, extent_end -
+						inline_limit);
 				btrfs_truncate_item(trans, root, path,
 						    new_size, 1);
 			}
 		}
 		/* delete the entire extent */
 		if (!keep) {
+			if (found_inline)
+				inode_sub_bytes(inode, extent_end -
+						key.offset);
 			ret = btrfs_del_item(trans, root, path);
 			/* TODO update progress marker and return */
 			BUG_ON(ret);
@@ -743,8 +747,7 @@ next_slot:
 			u32 new_size;
 			new_size = btrfs_file_extent_calc_inline_size(
 						   extent_end - end);
-			dec_i_blocks(inode, (extent_end - key.offset) -
-					(extent_end - end));
+			inode_sub_bytes(inode, end - key.offset);
 			ret = btrfs_truncate_item(trans, root, path,
 						  new_size, 0);
 			BUG_ON(ret);
@@ -791,9 +794,7 @@ next_slot:
 			}
 			btrfs_release_path(root, path);
 			if (disk_bytenr != 0) {
-				inode->i_blocks +=
-				      btrfs_file_extent_num_bytes(leaf,
-								  extent) >> 9;
+				inode_add_bytes(inode, extent_end - end);
 			}
 		}
 
@@ -801,7 +802,8 @@ next_slot:
 			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
 
 			if (disk_bytenr != 0) {
-				dec_i_blocks(inode, le64_to_cpu(old.num_bytes));
+				inode_sub_bytes(inode,
+						le64_to_cpu(old.num_bytes));
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr,
 						le64_to_cpu(old.disk_num_bytes),
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ff0c3597665..f9df89c5fdf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -652,7 +652,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
-	inode->i_blocks += ordered_extent->len >> 9;
+	inode_add_bytes(inode, ordered_extent->len);
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
@@ -1104,7 +1104,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 
-	inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item);
+	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
@@ -1184,7 +1184,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
 				inode->i_ctime.tv_nsec);
 
-	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
+	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
 	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
@@ -1679,7 +1679,7 @@ search_again:
 				num_dec = (orig_num_bytes -
 					   extent_num_bytes);
 				if (root->ref_cows && extent_start != 0)
-					dec_i_blocks(inode, num_dec);
+					inode_sub_bytes(inode, num_dec);
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				extent_num_bytes =
@@ -1690,7 +1690,7 @@ search_again:
 				if (extent_start != 0) {
 					found_extent = 1;
 					if (root->ref_cows)
-						dec_i_blocks(inode, num_dec);
+						inode_sub_bytes(inode, num_dec);
 				}
 				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
@@ -1700,8 +1700,8 @@ search_again:
 				u32 size = new_size - found_key.offset;
 
 				if (root->ref_cows) {
-					dec_i_blocks(inode, item_end + 1 -
-						    found_key.offset - size);
+					inode_sub_bytes(inode, item_end + 1 -
+							new_size);
 				}
 				size =
 				    btrfs_file_extent_calc_inline_size(size);
@@ -1709,8 +1709,8 @@ search_again:
 							  size, 1);
 				BUG_ON(ret);
 			} else if (root->ref_cows) {
-				dec_i_blocks(inode, item_end + 1 -
-					     found_key.offset);
+				inode_sub_bytes(inode, item_end + 1 -
+						found_key.offset);
 			}
 		}
 delete:
@@ -2514,7 +2514,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_gid = current->fsgid;
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
-	inode->i_blocks = 0;
+	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				  struct btrfs_inode_item);
@@ -3557,7 +3557,8 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
 	stat->blksize = PAGE_CACHE_SIZE;
-	stat->blocks = inode->i_blocks + (BTRFS_I(inode)->delalloc_bytes >> 9);
+	stat->blocks = (inode_get_bytes(inode) +
+			BTRFS_I(inode)->delalloc_bytes) >> 9;
 	return 0;
 }
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 04de767a8db..ab7a0f61ded 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -99,7 +99,7 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	inode_item->generation = cpu_to_le64(1);
 	inode_item->size = cpu_to_le64(3);
 	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
@@ -671,7 +671,7 @@ out:
 	btrfs_release_path(root, path);
 	if (ret == 0) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_blocks = src->i_blocks;
+		inode_set_bytes(inode, inode_get_bytes(src));
 		btrfs_i_size_write(inode, src->i_size);
 		BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
 		ret = btrfs_update_inode(trans, root, inode);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 88bbfd959f1..8df719a73d8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -113,7 +113,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->generation = cpu_to_le64(1);
 	inode_item->size = cpu_to_le64(3);
 	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
@@ -598,8 +598,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	ret = overwrite_item(trans, root, path, eb, slot, key);
 	BUG_ON(ret);
 
-	/* btrfs_drop_extents changes i_blocks, update it here */
-	inode->i_blocks += (extent_end - start) >> 9;
+	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
+	inode_add_bytes(inode, extent_end - start);
 	btrfs_update_inode(trans, root, inode);
 out:
 	if (inode)
-- 
cgit v1.2.3-70-g09d2


From 3bb1a1bc42f2ae9582c28adf620484efcd4da38d Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 9 Oct 2008 11:46:24 -0400
Subject: Btrfs: Remove offset field from struct btrfs_extent_ref

The offset field in struct btrfs_extent_ref records the position
inside the file at which the file extent is referenced. In the new back
reference system, the tree leaves holding references to a file extent
are recorded explicitly. We can scan these tree leaves very quickly, so the
offset field is not required.

This patch also makes the back reference system check the objectid
when extents are being deleted.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.c       |  24 +++----
 fs/btrfs/ctree.h       |  25 ++-----
 fs/btrfs/extent-tree.c | 185 +++++++++++++++++++++++--------------------------
 fs/btrfs/file.c        |   6 +-
 fs/btrfs/inode.c       |   6 +-
 fs/btrfs/ioctl.c       |   2 +-
 fs/btrfs/print-tree.c  |   3 +-
 fs/btrfs/tree-log.c    |   8 +--
 8 files changed, 115 insertions(+), 144 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2eab4643dcb..9caeb377de6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,8 +254,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
 						  root->root_key.objectid,
-						  trans->transid, level, 0,
-						  &ins);
+						  trans->transid, level, &ins);
 		BUG_ON(ret);
 		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
 					    buf->len);
@@ -333,7 +332,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 					  buf->len, buf->start,
 					  root->root_key.objectid,
 					  btrfs_header_generation(buf),
-					  0, 0, 1);
+					  level, 1);
 		}
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
@@ -347,7 +346,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
 		btrfs_free_extent(trans, root, buf->start, buf->len,
 				  parent_start, btrfs_header_owner(parent),
-				  btrfs_header_generation(parent), 0, 0, 1);
+				  btrfs_header_generation(parent), level, 1);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -927,7 +926,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		ret = btrfs_update_extent_ref(trans, root, child->start,
 					      mid->start, child->start,
 					      root->root_key.objectid,
-					      trans->transid, level - 1, 0);
+					      trans->transid, level - 1);
 		BUG_ON(ret);
 
 		add_root_to_dirty_list(root);
@@ -940,7 +939,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
 					mid->start, root->root_key.objectid,
-					btrfs_header_generation(mid), 0, 0, 1);
+					btrfs_header_generation(mid),
+					level, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
 		return ret;
@@ -1006,7 +1006,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			wret = btrfs_free_extent(trans, root, bytenr,
 						 blocksize, parent->start,
 						 btrfs_header_owner(parent),
-						 generation, 0, 0, 1);
+						 generation, level, 1);
 			if (wret)
 				ret = wret;
 		} else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
 					 parent->start,
 					 btrfs_header_owner(parent),
-					 root_gen, 0, 0, 1);
+					 root_gen, level, 1);
 		if (wret)
 			ret = wret;
 	} else {
@@ -1691,13 +1691,13 @@ next_level:
 					blocksize, parent->start,
 					btrfs_header_owner(parent),
 					btrfs_header_generation(parent),
-					level - 1, 0);
+					level - 1);
 		BUG_ON(ret);
 		ret = btrfs_free_extent(trans, root, bytenr,
 					blocksize, parent->start,
 					btrfs_header_owner(parent),
 					btrfs_header_generation(parent),
-					level - 1, 0, 1);
+					level - 1, 1);
 		BUG_ON(ret);
 
 		if (generation == trans->transid) {
@@ -1973,7 +1973,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	ret = btrfs_update_extent_ref(trans, root, lower->start,
 				      lower->start, c->start,
 				      root->root_key.objectid,
-				      trans->transid, level - 1, 0);
+				      trans->transid, level - 1);
 	BUG_ON(ret);
 
 	/* the super has an extra ref to root->node */
@@ -3213,7 +3213,7 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 				btrfs_level_size(root, 0),
 				path->nodes[1]->start,
 				btrfs_header_owner(path->nodes[1]),
-				root_gen, 0, 0, 1);
+				root_gen, 0, 1);
 	return ret;
 }
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50fbcc9ec45..a37fd783407 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -374,7 +374,6 @@ struct btrfs_extent_ref {
 	__le64 root;
 	__le64 generation;
 	__le64 objectid;
-	__le64 offset;
 	__le32 num_refs;
 } __attribute__ ((__packed__));
 
@@ -1082,7 +1081,6 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
 BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
 BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
 
 BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
@@ -1090,8 +1088,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
 			 objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref,
-			 offset, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
 			 num_refs, 32);
 
@@ -1522,29 +1518,20 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
-int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct btrfs_path *path,
-				 u64 bytenr, u64 parent,
-				 u64 root_objectid, u64 ref_generation,
-				 u64 owner, u64 owner_offset);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_bytes,
 		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 owner_offset,
-		       u64 empty_size, u64 hint_byte,
+		       u64 owner, u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data);
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins);
+				u64 owner, struct btrfs_key *ins);
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins);
+				u64 owner, struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -1563,7 +1550,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
 		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, u64 owner_offset, int pin);
+		      u64 owner_objectid, int pin);
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -1572,12 +1559,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner, u64 owner_offset);
+			 u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
 			    u64 orig_parent, u64 parent,
 			    u64 root_objectid, u64 ref_generation,
-			    u64 owner, u64 owner_offset);
+			    u64 owner_objectid);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 69db54e09fb..ab36769c356 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -525,31 +525,28 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  * - Objectid of the subvolume root
  * - Generation number of the tree holding the reference
  * - objectid of the file holding the reference
- * - offset in the file corresponding to the key holding the reference
  * - number of references holding by parent node (alway 1 for tree blocks)
  *
  * Btree leaf may hold multiple references to a file extent. In most cases,
  * these references are from same file and the corresponding offsets inside
- * the file are close together. So inode objectid and offset in file are
- * just hints, they provide hints about where in the btree the references
- * can be found and when we can stop searching.
+ * the file are close together.
  *
  * When a file extent is allocated the fields are filled in:
- *     (root_key.objectid, trans->transid, inode objectid, offset in file, 1)
+ *     (root_key.objectid, trans->transid, inode objectid, 1)
  *
  * When a leaf is cow'd new references are added for every file extent found
  * in the leaf.  It looks similar to the create case, but trans->transid will
  * be different when the block is cow'd.
  *
- *     (root_key.objectid, trans->transid, inode objectid, offset in file,
+ *     (root_key.objectid, trans->transid, inode objectid,
  *      number of references in the leaf)
  *
- * Because inode objectid and offset in file are just hints, they are not
- * used when backrefs are deleted. When a file extent is removed either
- * during snapshot deletion or file truncation, we find the corresponding
- * back back reference and check the following fields.
+ * When a file extent is removed either during snapshot deletion or
+ * file truncation, we find the corresponding back reference and check
+ * the following fields:
  *
- *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf))
+ *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
+ *      inode objectid)
  *
  * Btree extents can be referenced by:
  *
@@ -558,21 +555,21 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  *
  * When a tree block is created, back references are inserted:
  *
- * (root->root_key.objectid, trans->transid, level, 0, 1)
+ * (root->root_key.objectid, trans->transid, level, 1)
  *
  * When a tree block is cow'd, new back references are added for all the
  * blocks it points to. If the tree block isn't in reference counted root,
  * the old back references are removed. These new back references are of
  * the form (trans->transid will have increased since creation):
  *
- * (root->root_key.objectid, trans->transid, level, 0, 1)
+ * (root->root_key.objectid, trans->transid, level, 1)
  *
  * When a backref is in deleting, the following fields are checked:
  *
  * if backref was for a tree root:
- *     (btrfs_header_owner(itself), btrfs_header_generation(itself))
+ *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
  * else
- *     (btrfs_header_owner(parent), btrfs_header_generation(parent))
+ *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
  *
  * Back Reference Key composing:
  *
@@ -584,13 +581,15 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 
 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
-					  struct btrfs_path *path, u64 bytenr,
-					  u64 parent, u64 ref_root,
-					  u64 ref_generation, int del)
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 ref_root, u64 ref_generation,
+					  u64 owner_objectid, int del)
 {
 	struct btrfs_key key;
 	struct btrfs_extent_ref *ref;
 	struct extent_buffer *leaf;
+	u64 ref_objectid;
 	int ret;
 
 	key.objectid = bytenr;
@@ -607,8 +606,11 @@ static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+	ref_objectid = btrfs_ref_objectid(leaf, ref);
 	if (btrfs_ref_root(leaf, ref) != ref_root ||
-	    btrfs_ref_generation(leaf, ref) != ref_generation) {
+	    btrfs_ref_generation(leaf, ref) != ref_generation ||
+	    (ref_objectid != owner_objectid &&
+	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
 		ret = -EIO;
 		WARN_ON(1);
 		goto out;
@@ -623,7 +625,7 @@ static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
 					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid, u64 owner_offset)
+					  u64 owner_objectid)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -643,7 +645,6 @@ static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
 		btrfs_set_ref_root(leaf, ref, ref_root);
 		btrfs_set_ref_generation(leaf, ref, ref_generation);
 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-		btrfs_set_ref_offset(leaf, ref, owner_offset);
 		btrfs_set_ref_num_refs(leaf, ref, 1);
 	} else if (ret == -EEXIST) {
 		u64 existing_owner;
@@ -663,14 +664,10 @@ static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
 		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
 
 		existing_owner = btrfs_ref_objectid(leaf, ref);
-		if (existing_owner == owner_objectid &&
-		    btrfs_ref_offset(leaf, ref) > owner_offset) {
-			btrfs_set_ref_offset(leaf, ref, owner_offset);
-		} else if (existing_owner != owner_objectid &&
-			   existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
+		if (existing_owner != owner_objectid &&
+		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
 			btrfs_set_ref_objectid(leaf, ref,
 					BTRFS_MULTIPLE_OBJECTIDS);
-			btrfs_set_ref_offset(leaf, ref, 0);
 		}
 		ret = 0;
 	} else {
@@ -711,7 +708,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 				     u64 orig_parent, u64 parent,
 				     u64 orig_root, u64 ref_root,
 				     u64 orig_generation, u64 ref_generation,
-				     u64 owner_objectid, u64 owner_offset)
+				     u64 owner_objectid)
 {
 	int ret;
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
@@ -762,7 +759,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, orig_parent, orig_root,
-				    orig_generation, 1);
+				    orig_generation, owner_objectid, 1);
 	if (ret)
 		goto out;
 	ret = remove_extent_backref(trans, extent_root, path);
@@ -770,7 +767,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	ret = insert_extent_backref(trans, extent_root, path, bytenr,
 				    parent, ref_root, ref_generation,
-				    owner_objectid, owner_offset);
+				    owner_objectid);
 	BUG_ON(ret);
 	finish_current_insert(trans, extent_root);
 	del_pending_extents(trans, extent_root);
@@ -783,7 +780,7 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
 			    u64 orig_parent, u64 parent,
 			    u64 ref_root, u64 ref_generation,
-			    u64 owner_objectid, u64 owner_offset)
+			    u64 owner_objectid)
 {
 	int ret;
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
@@ -793,7 +790,7 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
 					parent, ref_root, ref_root,
 					ref_generation, ref_generation,
-					owner_objectid, owner_offset);
+					owner_objectid);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -803,7 +800,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  u64 orig_parent, u64 parent,
 				  u64 orig_root, u64 ref_root,
 				  u64 orig_generation, u64 ref_generation,
-				  u64 owner_objectid, u64 owner_offset)
+				  u64 owner_objectid)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -845,7 +842,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
 				    path, bytenr, parent,
 				    ref_root, ref_generation,
-				    owner_objectid, owner_offset);
+				    owner_objectid);
 	BUG_ON(ret);
 	finish_current_insert(trans, root->fs_info->extent_root);
 	del_pending_extents(trans, root->fs_info->extent_root);
@@ -858,7 +855,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 ref_root, u64 ref_generation,
-			 u64 owner_objectid, u64 owner_offset)
+			 u64 owner_objectid)
 {
 	int ret;
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
@@ -867,7 +864,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	maybe_lock_mutex(root);
 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
 				     0, ref_root, 0, ref_generation,
-				     owner_objectid, owner_offset);
+				     owner_objectid);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -1179,7 +1176,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int ret = 0;
 	int faili = 0;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64, u64, u64);
 
 	ref_root = btrfs_header_owner(buf);
 	ref_generation = btrfs_header_generation(buf);
@@ -1223,7 +1220,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
-					   key.objectid, key.offset);
+					   key.objectid);
 			maybe_unlock_mutex(root);
 
 			if (ret) {
@@ -1238,7 +1235,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
-					   level - 1, 0);
+					   level - 1);
 			maybe_unlock_mutex(root);
 			if (ret) {
 				faili = i;
@@ -1314,7 +1311,7 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
-					    key.objectid, key.offset);
+					    key.objectid);
 			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
@@ -1325,7 +1322,7 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
-					    level - 1, 0);
+					    level - 1);
 			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
@@ -1781,13 +1778,14 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 						start, extent_op->parent,
 						extent_root->root_key.objectid,
 						extent_op->generation,
-						extent_op->level, 0);
+						extent_op->level);
 			BUG_ON(err);
 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
 			err = lookup_extent_backref(trans, extent_root, path,
 						start, extent_op->orig_parent,
 						extent_root->root_key.objectid,
-						extent_op->orig_generation, 0);
+						extent_op->orig_generation,
+						extent_op->level, 0);
 			BUG_ON(err);
 
 			clear_extent_bits(&info->extent_ins, start, end,
@@ -1870,8 +1868,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, u64 owner_offset,
-			 int pin, int mark_free)
+			 u64 owner_objectid, int pin, int mark_free)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -1894,8 +1891,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->reada = 1;
-	ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent,
-				    root_objectid, ref_generation, 1);
+	ret = lookup_extent_backref(trans, extent_root, path,
+				    bytenr, parent, root_objectid,
+				    ref_generation, owner_objectid, 1);
 	if (ret == 0) {
 		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
@@ -1926,9 +1924,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		printk("Unable to find ref byte nr %Lu root %Lu "
-		       " gen %Lu owner %Lu offset %Lu\n", bytenr,
-		       root_objectid, ref_generation, owner_objectid,
-		       owner_offset);
+		       "gen %Lu owner %Lu\n", bytenr,
+		       root_objectid, ref_generation, owner_objectid);
 	}
 
 	leaf = path->nodes[0];
@@ -2068,7 +2065,7 @@ free_extent:
 					    extent_op->orig_parent,
 					    extent_root->root_key.objectid,
 					    extent_op->orig_generation,
-					    extent_op->level, 0, 0, mark_free);
+					    extent_op->level, 0, mark_free);
 			kfree(extent_op);
 		} else {
 			kfree(extent_op);
@@ -2107,7 +2104,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 root_objectid, u64 ref_generation,
-			       u64 owner_objectid, u64 owner_offset, int pin)
+			       u64 owner_objectid, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
@@ -2156,8 +2153,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		pin = 1;
 
 	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
-			    root_objectid, ref_generation, owner_objectid,
-			    owner_offset, pin, pin == 0);
+			    root_objectid, ref_generation,
+			    owner_objectid, pin, pin == 0);
 
 	finish_current_insert(trans, root->fs_info->extent_root);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
@@ -2168,14 +2165,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
 		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, u64 owner_offset, int pin)
+		      u64 owner_objectid, int pin)
 {
 	int ret;
 
 	maybe_lock_mutex(root);
 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
 				  root_objectid, ref_generation,
-				  owner_objectid, owner_offset, pin);
+				  owner_objectid, pin);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2522,8 +2519,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root, u64 parent,
 					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, u64 owner_offset,
-					 struct btrfs_key *ins)
+					 u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
@@ -2597,7 +2593,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-	btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
 	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
@@ -2629,17 +2624,15 @@ out:
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins)
+				u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
 	maybe_lock_mutex(root);
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-					    root_objectid, ref_generation,
-					    owner, owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
+					    ref_generation, owner, ins);
 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
 	maybe_unlock_mutex(root);
 	return ret;
@@ -2653,8 +2646,7 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins)
+				u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
@@ -2665,9 +2657,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
 	BUG_ON(ret);
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-					    root_objectid, ref_generation,
-					    owner, owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
+					    ref_generation, owner, ins);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2683,8 +2674,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_alloc_size,
 		       u64 root_objectid, u64 ref_generation,
-		       u64 owner_objectid, u64 owner_offset,
-		       u64 empty_size, u64 hint_byte,
+		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
 	int ret;
@@ -2698,7 +2688,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
 					root_objectid, ref_generation,
-					owner_objectid, owner_offset, ins);
+					owner_objectid, ins);
 		BUG_ON(ret);
 
 	} else {
@@ -2750,7 +2740,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *buf;
 
 	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
-				 root_objectid, ref_generation, level, 0,
+				 root_objectid, ref_generation, level,
 				 empty_size, hint, (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
@@ -2800,7 +2790,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
-				key.objectid, key.offset, 0);
+				key.objectid, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 
@@ -2824,7 +2814,7 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
-					  info->objectid, info->offset, 0);
+					  info->objectid, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		atomic_inc(&root->fs_info->throttle_gen);
@@ -2940,7 +2930,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
-						root_owner, root_gen, 0, 0, 1);
+						root_owner, root_gen,
+						*level - 1, 1);
 			BUG_ON(ret);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 
@@ -2970,9 +2961,10 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				*level = 0;
 				break;
 			}
-			if (printk_ratelimit())
+			if (printk_ratelimit()) {
 				printk("leaf ref miss for bytenr %llu\n",
 				       (unsigned long long)bytenr);
+			}
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
@@ -3020,7 +3012,7 @@ out:
 	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
-				  0, 0, 1);
+				  *level, 1);
 	mutex_unlock(&root->fs_info->alloc_mutex);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
@@ -3073,8 +3065,8 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
 			ret = btrfs_free_extent(trans, root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len,
-						parent->start,
-						root_owner, root_gen, 0, 0, 1);
+						parent->start, root_owner,
+						root_gen, *level, 1);
 			BUG_ON(ret);
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -3308,7 +3300,6 @@ struct btrfs_ref_path {
 	u64 root_objectid;
 	u64 root_generation;
 	u64 owner_objectid;
-	u64 owner_offset;
 	u32 num_refs;
 	int lowest_level;
 	int current_level;
@@ -3480,7 +3471,6 @@ found:
 
 		if (ref_path->lowest_level == level) {
 			ref_path->owner_objectid = ref_objectid;
-			ref_path->owner_offset = btrfs_ref_offset(leaf, ref);
 			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
 		}
 
@@ -3686,16 +3676,20 @@ static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
 	u64 ext_offset;
 	u64 first_pos;
 	u32 nritems;
+	int nr_scaned = 0;
 	int extent_locked = 0;
 	int ret;
 
-	first_pos = ref_path->owner_offset;
+	memcpy(&key, leaf_key, sizeof(key));
+	first_pos = INT_LIMIT(loff_t) - extent_key->offset;
 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
-		key.objectid = ref_path->owner_objectid;
-		key.offset = ref_path->owner_offset;
-		key.type = BTRFS_EXTENT_DATA_KEY;
-	} else {
-		memcpy(&key, leaf_key, sizeof(key));
+		if (key.objectid < ref_path->owner_objectid ||
+		    (key.objectid == ref_path->owner_objectid &&
+		     key.type < BTRFS_EXTENT_DATA_KEY)) {
+			key.objectid = ref_path->owner_objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = 0;
+		}
 	}
 
 	while (1) {
@@ -3718,8 +3712,7 @@ next:
 		}
 
 		if (path->slots[0] >= nritems) {
-			if (ref_path->owner_objectid ==
-			    BTRFS_MULTIPLE_OBJECTIDS)
+			if (++nr_scaned > 2)
 				break;
 
 			BUG_ON(extent_locked);
@@ -3858,7 +3851,7 @@ next:
 						leaf->start,
 						root->root_key.objectid,
 						trans->transid,
-						key.objectid, key.offset);
+						key.objectid);
 			BUG_ON(ret);
 
 			ret = btrfs_free_extent(trans, root,
@@ -3867,7 +3860,7 @@ next:
 						leaf->start,
 						btrfs_header_owner(leaf),
 						btrfs_header_generation(leaf),
-						key.objectid, key.offset, 0);
+						key.objectid, 0);
 			BUG_ON(ret);
 
 			btrfs_release_path(root, path);
@@ -3925,8 +3918,7 @@ next:
 						new_extents[i].disk_num_bytes,
 						leaf->start,
 						root->root_key.objectid,
-						trans->transid,
-						key.objectid, key.offset);
+						trans->transid, key.objectid);
 				BUG_ON(ret);
 				btrfs_release_path(root, path);
 
@@ -4182,14 +4174,13 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					new_extent->disk_num_bytes,
 					leaf->start,
 					root->root_key.objectid,
-					trans->transid,
-					key.objectid, key.offset);
+					trans->transid, key.objectid);
 		BUG_ON(ret);
 		ret = btrfs_free_extent(trans, root,
 					bytenr, num_bytes, leaf->start,
 					btrfs_header_owner(leaf),
 					btrfs_header_generation(leaf),
-					key.objectid, key.offset, 0);
+					key.objectid, 0);
 		BUG_ON(ret);
 		cond_resched();
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 18dfdf5f91d..69abbe19add 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -788,8 +788,7 @@ next_slot:
 						le64_to_cpu(old.disk_num_bytes),
 						leaf->start,
 						root->root_key.objectid,
-						trans->transid,
-						ins.objectid, ins.offset);
+						trans->transid, ins.objectid);
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
@@ -808,8 +807,7 @@ next_slot:
 						disk_bytenr,
 						le64_to_cpu(old.disk_num_bytes),
 						leaf_start, root_owner,
-						root_gen, key.objectid,
-						key.offset, 0);
+						root_gen, key.objectid, 0);
 				BUG_ON(ret);
 				*hint_byte = disk_bytenr;
 			}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9df89c5fdf..3ab147dc3c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -647,8 +647,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
 					  root->root_key.objectid,
-					  trans->transid, inode->i_ino,
-					  ordered_extent->file_offset, &ins);
+					  trans->transid, inode->i_ino, &ins);
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
@@ -1734,8 +1733,7 @@ delete:
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
 						leaf->start, root_owner,
-						root_gen, inode->i_ino,
-						found_key.offset, 0);
+						root_gen, inode->i_ino, 0);
 			BUG_ON(ret);
 		}
 next:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab7a0f61ded..50c8a066d1f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -658,7 +658,7 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 						     ds, dl, leaf->start,
 						     root->root_key.objectid,
 						     trans->transid,
-						     inode->i_ino, key.offset);
+						     inode->i_ino);
 					BUG_ON(ret);
 				}
 			}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 3577badfa5b..bd9ab3e9a7f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -102,11 +102,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_EXTENT_REF_KEY:
 			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
 			printk("\t\textent back ref root %llu gen %llu "
-			       "owner %llu offset %llu num_refs %lu\n",
+			       "owner %llu num_refs %lu\n",
 			       (unsigned long long)btrfs_ref_root(l, ref),
 			       (unsigned long long)btrfs_ref_generation(l, ref),
 			       (unsigned long long)btrfs_ref_objectid(l, ref),
-			       (unsigned long long)btrfs_ref_offset(l, ref),
 			       (unsigned long)btrfs_ref_num_refs(l, ref));
 			break;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8df719a73d8..cf618cc8b34 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -460,8 +460,7 @@ insert:
 						ins.objectid, ins.offset,
 						path->nodes[0]->start,
 						root->root_key.objectid,
-						trans->transid,
-						key->objectid, key->offset);
+						trans->transid, key->objectid);
 			} else {
 				/*
 				 * insert the extent pointer in the extent
@@ -471,7 +470,7 @@ insert:
 						path->nodes[0]->start,
 						root->root_key.objectid,
 						trans->transid, key->objectid,
-						key->offset, &ins);
+						&ins);
 				BUG_ON(ret);
 			}
 		}
@@ -2534,8 +2533,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						   dst_path->nodes[0]->start,
 						   BTRFS_TREE_LOG_OBJECTID,
 						   trans->transid,
-						   ins_keys[i].objectid,
-						   ins_keys[i].offset);
+						   ins_keys[i].objectid);
 					BUG_ON(ret);
 				}
 			}
-- 
cgit v1.2.3-70-g09d2


From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Oct 2008 14:49:59 -0400
Subject: Btrfs: Add zlib compression support

This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large
surgery to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field.  Neither the encryption nor the
'other' field is currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a
software only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread.  This makes it tricky to
spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/Kconfig              |   2 +
 fs/btrfs/Makefile       |   3 +-
 fs/btrfs/compression.c  | 454 ++++++++++++++++++++++++++++++++++
 fs/btrfs/compression.h  |  47 ++++
 fs/btrfs/ctree.h        |  99 ++++++--
 fs/btrfs/disk-io.c      |  18 +-
 fs/btrfs/disk-io.h      |   1 +
 fs/btrfs/extent-tree.c  |  27 +-
 fs/btrfs/extent_io.c    | 411 ++++++++++++++++++++++++-------
 fs/btrfs/extent_io.h    |  17 +-
 fs/btrfs/extent_map.c   |   9 +
 fs/btrfs/extent_map.h   |   6 +-
 fs/btrfs/file-item.c    |  75 +++++-
 fs/btrfs/file.c         | 263 +++++---------------
 fs/btrfs/inode.c        | 584 ++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/ordered-data.c |   9 +-
 fs/btrfs/ordered-data.h |  10 +-
 fs/btrfs/print-tree.c   |   7 +-
 fs/btrfs/super.c        |  10 +-
 fs/btrfs/tree-log.c     |   3 +-
 fs/btrfs/volumes.c      |   2 +
 fs/btrfs/zlib.c         | 637 ++++++++++++++++++++++++++++++++++++++++++++++++
 22 files changed, 2315 insertions(+), 379 deletions(-)
 create mode 100644 fs/btrfs/compression.c
 create mode 100644 fs/btrfs/compression.h
 create mode 100644 fs/btrfs/zlib.c

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/Kconfig b/fs/Kconfig
index 18f5a85b47c..31cce5d88b1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -501,6 +501,8 @@ config BTRFS_FS
 	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
 	depends on EXPERIMENTAL
 	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142..d2cf5a54a4b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..c5470367ca5
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* bios still in flight for this extent, the last to end cleans up */
+	atomic_t pending_bios;
+
+	/* array of pages holding the compressed data, kfreed at end_io */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting byte offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes of compressed data on disk */
+	unsigned long compressed_len;
+
+	/* number of pages in the compressed_pages array */
+	unsigned long nr_pages;
+
+	/* set to 1 when any bio, or the decompression, fails */
+	int errors;
+
+	/* for reads, the bio we are copying the data into; NULL for writes */
+	struct bio *orig_bio;
+};
+
+/*
+ * allocate an empty bio for bdev that starts at disk byte first_byte.
+ * Returns NULL if no bio could be allocated.
+ */
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	int vecs = bio_get_nr_vecs(bdev);
+	struct bio *new_bio = bio_alloc(gfp_flags, vecs);
+
+	/* PF_MEMALLOC tasks retry, halving the vec count each time */
+	if (!new_bio && (current->flags & PF_MEMALLOC))
+		while (!new_bio && (vecs /= 2))
+			new_bio = bio_alloc(gfp_flags, vecs);
+	if (!new_bio)
+		return NULL;
+	new_bio->bi_size = 0;
+	new_bio->bi_bdev = bdev;
+	new_bio->bi_sector = first_byte >> 9;
+	return new_bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ *
+ * bio is one of the bios reading the compressed extent, bi_private
+ * points to the compressed_bio they all share.  A non-zero err fails
+ * the original bio once the last of them has completed.
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct compressed_bio *cb = bio->bi_private;
+	struct page *page;
+	unsigned long index;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, lets start
+	 * the decompression.  A failure is reported the same
+	 * way as an IO error
+	 */
+	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+					cb->start,
+					cb->orig_bio->bi_io_vec,
+					cb->orig_bio->bi_vcnt,
+					cb->compressed_len);
+	if (ret)
+		cb->errors = 1;
+
+	/* release the compressed pages */
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors)
+		bio_io_error(cb->orig_bio);
+	else
+		bio_endio(cb->orig_bio, 0);
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file pages of inode that cover
+ * the ram_size bytes starting at offset start.  Always returns 0.
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+					     unsigned long ram_size)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	unsigned int i, found;
+
+	while (nr_pages > 0) {
+		/* min_t: nr_pages and ARRAY_SIZE() differ in type on 32 bit */
+		found = find_get_pages_contig(inode->i_mapping, index,
+					min_t(unsigned long, nr_pages,
+					      ARRAY_SIZE(pages)), pages);
+		if (found == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < found; i++) {
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= found;
+		index += found;
+	}
+	/* the inode may be gone now */
+	return 0;
+}
+
+/*
+ * end_io handler for every bio of a compressed write.  Only the last bio
+ * to finish does the cleanup: it clears writeback on the file pages and
+ * frees the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct compressed_bio *cb = bio->bi_private;
+	struct extent_io_tree *io_tree;
+	struct inode *owner;
+	unsigned long nr;
+	u64 last_byte;
+
+	/*
+	 * NOTE(review): the error is recorded here but nothing below reads
+	 * cb->errors, confirm a failed write is reported somewhere
+	 */
+	if (err)
+		cb->errors = 1;
+
+	/* other bios are still pending for this extent, just exit */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* step one: call back into the FS for the end_io operations */
+	owner = cb->inode;
+	io_tree = &BTRFS_I(owner)->io_tree;
+	last_byte = cb->start + cb->len - 1;
+	io_tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					    cb->start, last_byte, NULL, 1);
+
+	/* step two: clear writeback on the file pages */
+	end_compressed_writeback(owner, cb->start, cb->len);
+	/* note, our inode could be gone now */
+
+	/*
+	 * step three: release the compressed pages, these came from
+	 * alloc_page and are not attached to the inode at all
+	 */
+	for (nr = 0; nr < cb->nr_pages; nr++) {
+		struct page *cpage = cb->compressed_pages[nr];
+
+		cpage->mapping = NULL;
+		page_cache_release(cpage);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ *
+ * start/len is the file range, disk_start/compressed_len the disk range.
+ * Returns 0 once every bio is submitted, -ENOMEM if nothing was started.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				 unsigned long len, u64 disk_start,
+				 unsigned long compressed_len,
+				 struct page **compressed_pages,
+				 unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int page_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	if (!cb)
+		return -ENOMEM;
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	ret = btrfs_csum_file_bytes(root, inode, start, len);
+	BUG_ON(ret);
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+
+	/* create and submit bios for the compressed pages */
+	bytes_left = compressed_len;
+	while (bytes_left > 0) {
+		page = compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		ret = 0;
+		if (bio->bi_size)
+			ret = io_tree->ops->merge_bio_hook(page, 0,
+						PAGE_CACHE_SIZE, bio, 0);
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+			/*
+			 * count the next bio before submitting this one, or
+			 * its end_io could drop the count to zero and free
+			 * the cb while we are still using it
+			 */
+			atomic_inc(&cb->pending_bios);
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			BUG_ON(ret);
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret);
+			bio_put(bio);
+
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		page_index++;
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(bio);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret);
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, the bytes are copied into the bio
+ * we were passed and its end_io is called.  Returns 0; errors hit BUG_ON.
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long page_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	struct extent_map *em;
+	int ret;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+	/* NOTE(review): em, cb and the pages below are not NULL checked */
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+
+	cb->start = em->start;
+	compressed_len = em->block_len;
+	free_extent_map(em);
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = bio;
+
+	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+				 PAGE_CACHE_SIZE;
+	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+				       GFP_NOFS);
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+	}
+	cb->nr_pages = nr_pages;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		page = cb->compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (comp_bio->bi_size)
+			ret = tree->ops->merge_bio_hook(page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			BUG_ON(ret);
+
+			bio_put(comp_bio);
+			/* new bio gets the same cb and read end_io */
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		cur_disk_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(comp_bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	BUG_ON(ret);
+
+	bio_put(comp_bio);
+	return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 00000000000..421f5b4aa71
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+			      u64 disk_start,
+			      struct bio_vec *bvec,
+			      int vcnt,
+			      size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47..793d8fdda24 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-/*
- * there is no padding here on purpose.  If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+	BTRFS_COMPRESS_NONE = 0,
+	BTRFS_COMPRESS_ZLIB = 1,
+	BTRFS_COMPRESS_LAST = 2, /* presumably an end marker, not a method */
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+	BTRFS_ENCRYPTION_NONE = 0,
+	BTRFS_ENCRYPTION_LAST = 1, /* presumably an end marker as well */
+} btrfs_encryption_type;
+
 struct btrfs_inode_item {
 	/* nfs style generation number */
 	__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
+
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
 #define BTRFS_FILE_EXTENT_INLINE 1
 
 struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
 	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be.  So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption.  If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
 	u8 type;
+
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
 	 */
 	__le64 offset;
 	/*
-	 * the logical number of file blocks (no csums included)
+	 * the logical number of file blocks (no csums included).  This
+	 * always reflects the size uncompressed and without encoding.
 	 */
 	__le64 num_bytes;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
+#define BTRFS_MOUNT_COMPRESS		(1 << 5)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATASUM		(1 << 0)
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-					       struct btrfs_item *e)
-{
-	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return btrfs_item_size(eb, e) - offset;
-}
-
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
 		   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
 		  offset, 64);
 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
 		   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+		   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+		   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+		   other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item,
+ * read from ram_bytes (uncompressed size).  64-bit field, u32 return.
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       struct btrfs_file_extent_item *e)
+{
+	return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * extent header fields that sit in front of disk_bytenr.  If a file is
+ * compressed on disk, this is the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+						    struct btrfs_item *e)
+{
+	unsigned long offset;
+	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+	return btrfs_item_size(eb, e) - offset;
+}
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 objectid, u64 pos, u64 disk_offset,
-			       u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset);
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 				  int namelen);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio);
+			 size_t size, struct bio *bio, unsigned long bio_flags);
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb619..dc95f636a11 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	unsigned long bio_flags;
 	struct btrfs_work work;
 };
 
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	}
 	em->start = 0;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 		wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
-			       async->mirror_num);
+			       async->mirror_num, async->bio_flags);
 	kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+	async->bio_flags = bio_flags;
 
 	while(atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
 }
 
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
 	 */
 	if (!(rw & (1 << BIO_RW))) {
-		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+		return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num,
+				   inode, rw, bio, mirror_num, 0,
 				   __btree_submit_bio_hook);
 }
 
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbb..4eb1f1408d2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6..bbf04e80a1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 
 	em->start = extent_key->objectid - offset;
 	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
 	em->block_start = extent_key->objectid;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
 };
 
 struct disk_extent {
+	u64 ram_bytes;
 	u64 disk_bytenr;
 	u64 disk_num_bytes;
 	u64 offset;
 	u64 num_bytes;
+	u8 compression;
+	u8 encryption;
+	u16 other_encoding;
 };
 
 static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+									   fi);
 		WARN_ON(exts[nr].offset > 0);
 		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
@@ -3846,6 +3856,8 @@ next:
 						new_extents[0].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[0].disk_num_bytes);
+			btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[0].ram_bytes);
 			ext_offset += new_extents[0].offset;
 			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 			btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
 						new_extents[i].disk_bytenr);
 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[i].disk_num_bytes);
+				btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[i].ram_bytes);
+
+				btrfs_set_file_extent_compression(leaf, fi,
+						new_extents[i].compression);
+				btrfs_set_file_extent_encryption(leaf, fi,
+						new_extents[i].encryption);
+				btrfs_set_file_extent_other_encoding(leaf, fi,
+						new_extents[i].other_encoding);
+
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_len);
 				ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extent->ram_bytes);
 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extent->disk_bytenr);
 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	BUG_ON(err);
 
 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0);
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
 	BUG_ON(err);
 
 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f..314041fdfa4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-					     u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+					u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	u64 total_bytes = 0;
 
 	spin_lock_irq(&tree->lock);
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
 		if (!found)
@@ -1100,40 +1101,6 @@ search_again:
 				*end = state->end;
 			goto out;
 		}
-		if (!found && !(state->state & EXTENT_BOUNDARY)) {
-			struct extent_state *prev_state;
-			struct rb_node *prev_node = node;
-			while(1) {
-				prev_node = rb_prev(prev_node);
-				if (!prev_node)
-					break;
-				prev_state = rb_entry(prev_node,
-						      struct extent_state,
-						      rb_node);
-				if ((prev_state->end + 1 != state->start) ||
-				    !(prev_state->state & EXTENT_DELALLOC))
-					break;
-				if ((cur_start - prev_state->start) * 2 >
-				     max_bytes)
-					break;
-				state = prev_state;
-				node = prev_node;
-			}
-		}
-		if (state->state & EXTENT_LOCKED) {
-			DEFINE_WAIT(wait);
-			atomic_inc(&state->refs);
-			prepare_to_wait(&state->wq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			spin_unlock_irq(&tree->lock);
-			schedule();
-			spin_lock_irq(&tree->lock);
-			finish_wait(&state->wq, &wait);
-			free_extent_state(state);
-			goto search_again;
-		}
-		set_state_cb(tree, state, EXTENT_LOCKED);
-		state->state |= EXTENT_LOCKED;
 		if (!found)
 			*start = state->start;
 		found++;
@@ -1151,6 +1118,208 @@ out:
 	return found;
 }
 
+/* unlock the page cache pages of [start, end]; locked_page stays locked */
+static noinline int __unlock_for_delalloc(struct inode *inode,
+				struct page *locked_page, u64 start, u64 end)
+{
+	unsigned long first = start >> PAGE_CACHE_SHIFT;
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long left = last - first + 1;
+	struct page *batch[16];
+	int got;
+	int n;
+
+	if (last == first && first == locked_page->index)
+		return 0;
+
+	while (left) {
+		got = find_get_pages_contig(inode->i_mapping, first,
+				     min(left, ARRAY_SIZE(batch)), batch);
+		for (n = 0; n < got; n++) {
+			if (batch[n] != locked_page)
+				unlock_page(batch[n]);
+			page_cache_release(batch[n]);
+		}
+		first += got;
+		left -= got;
+		cond_resched();
+	}
+	return 0;
+}
+
+/*
+ * lock the page cache pages covering [delalloc_start, delalloc_end].
+ * locked_page is already held by the caller and is never locked here.
+ * Returns 0 when every page was locked.  If a lookup comes back empty
+ * the pages locked so far are unlocked again and -EAGAIN is returned.
+ */
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long first = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long last = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long cur = first;
+	unsigned long left = last - first + 1;
+	unsigned long nr_locked = 0;
+	struct page *batch[16];
+	u64 unlock_end;
+	int got;
+	int n;
+
+	/* only the caller's own page is in range, nothing to do */
+	if (first == locked_page->index && first == last)
+		return 0;
+
+	while (left) {
+		got = find_get_pages_contig(inode->i_mapping, cur,
+				     min(left, ARRAY_SIZE(batch)), batch);
+		if (!got)
+			goto undo;
+		/* take the page lock on everything this lookup returned */
+		for (n = 0; n < got; n++) {
+			if (batch[n] != locked_page)
+				lock_page(batch[n]);
+			page_cache_release(batch[n]);
+		}
+		nr_locked += got;
+		left -= got;
+		cur += got;
+		cond_resched();
+	}
+	return 0;
+
+undo:
+	/* a page went missing: drop the locks taken so far */
+	if (nr_locked) {
+		unlock_end = (u64)(first + nr_locked - 1) << PAGE_CACHE_SHIFT;
+		__unlock_for_delalloc(inode, locked_page, delalloc_start,
+				      unlock_end);
+	}
+	return -EAGAIN;
+}
+
+/*
+ * find a contiguous delalloc range at *start (limited by 'max_bytes'), lock
+ * its pages other than locked_page and lock the range in the extent tree.
+ * The range is handed back through *start and *end.
+ * Returns find_delalloc_range's nonzero result, or 0 when nothing was locked.
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+					     struct extent_io_tree *tree,
+					     struct page *locked_page,
+					     u64 *start, u64 *end,
+					     u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes);
+	if (!found) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		return found;
+	}
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 * if we're looping.
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+		/* one page is enough on the retry (end is inclusive) */
+		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	}
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, lets avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		if (!loops) {
+			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+			max_bytes = PAGE_CACHE_SIZE - offset;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret);
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1);
+	if (!ret) {
+		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+/*
+ * clear LOCKED and DELALLOC (and DIRTY if clear_dirty) on [start, end], then
+ * update and unlock each page found there except locked_page.  Returns 0.
+ */
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int end_writeback)
+{
+	unsigned long cur = start >> PAGE_CACHE_SHIFT;
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long left = last - cur + 1;
+	int bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+	struct page *batch[16];
+	int got;
+	int n;
+
+	if (clear_dirty)
+		bits |= EXTENT_DIRTY;
+	clear_extent_bit(tree, start, end, bits, 1, 0, GFP_NOFS);
+
+	while (left) {
+		got = find_get_pages_contig(inode->i_mapping, cur,
+				     min(left, ARRAY_SIZE(batch)), batch);
+		for (n = 0; n < got; n++) {
+			if (batch[n] != locked_page) {
+				if (clear_dirty)
+					clear_page_dirty_for_io(batch[n]);
+				if (set_writeback)
+					set_page_writeback(batch[n]);
+				if (end_writeback)
+					end_page_writeback(batch[n]);
+				unlock_page(batch[n]);
+			}
+			page_cache_release(batch[n]);
+		}
+		cur += got;
+		left -= got;
+		cond_resched();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+			  unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
-	struct rb_node *node;
-	struct extent_state *state;
 	u64 start;
 	u64 end;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
 	end = start + bvec->bv_len - 1;
 
-	spin_lock_irq(&tree->lock);
-	node = __etree_search(tree, start, NULL, NULL);
-	BUG_ON(!node);
-	state = rb_entry(node, struct extent_state, rb_node);
-	while(state->end < end) {
-		node = rb_next(node);
-		state = rb_entry(node, struct extent_state, rb_node);
-	}
-	BUG_ON(state->end != end);
-	spin_unlock_irq(&tree->lock);
-
 	bio->bi_private = NULL;
 
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-					   mirror_num);
+					   mirror_num, bio_flags);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct bio **bio_ret,
 			      unsigned long max_pages,
 			      bio_end_io_t end_io_func,
-			      int mirror_num)
+			      int mirror_num,
+			      unsigned long prev_bio_flags,
+			      unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio *bio;
 	int nr;
+	int contig = 0;
+	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+	size_t page_size = min(size, PAGE_CACHE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
-		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		if (old_compressed)
+			contig = bio->bi_sector == sector;
+		else
+			contig = bio->bi_sector + (bio->bi_size >> 9) ==
+				sector;
+
+		if (prev_bio_flags != bio_flags || !contig ||
 		    (tree->ops && tree->ops->merge_bio_hook &&
-		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
-		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio, mirror_num);
+		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
+					       bio_flags)) ||
+		    bio_add_page(bio, page, page_size, offset) < page_size) {
+			ret = submit_one_bio(rw, bio, mirror_num,
+					     prev_bio_flags);
 			bio = NULL;
 		} else {
 			return 0;
 		}
 	}
-	nr = bio_get_nr_vecs(bdev);
+	if (this_compressed)
+		nr = BIO_MAX_PAGES;
+	else
+		nr = bio_get_nr_vecs(bdev);
+
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
 	}
 
-
-	bio_add_page(bio, page, size, offset);
+	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
-		ret = submit_one_bio(rw, bio, mirror_num);
+		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
 	}
 
 	return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
-				   struct bio **bio, int mirror_num)
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	int nr = 0;
 	size_t page_offset = 0;
 	size_t iosize;
+	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
+	unsigned long this_bio_flag = 0;
 
 	set_page_extent_mapped(page);
 
 	end = page_end;
 	lock_extent(tree, start, end, GFP_NOFS);
 
+	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+		char *userpage;
+		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+		if (zero_offset) {
+			iosize = PAGE_CACHE_SIZE - zero_offset;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + zero_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+		}
+	}
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		BUG_ON(end < cur);
 
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			this_bio_flag = EXTENT_BIO_COMPRESSED;
+
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
+		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+			disk_io_size = em->block_len;
+			sector = em->block_start >> 9;
+		} else {
+			sector = (em->block_start + extent_offset) >> 9;
+			disk_io_size = iosize;
+		}
 		bdev = em->bdev;
 		block_start = em->block_start;
 		free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
 			pnr -= page->index;
 			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset,
+					 sector, disk_io_size, page_offset,
 					 bdev, bio, pnr,
-					 end_bio_extent_readpage, mirror_num);
+					 end_bio_extent_readpage, mirror_num,
+					 *bio_flags,
+					 this_bio_flag);
 			nr++;
+			*bio_flags = this_bio_flag;
 		}
 		if (ret)
 			SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			    get_extent_t *get_extent)
 {
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
 	u64 nr_delalloc;
 	u64 delalloc_end;
+	int page_started;
+	int compressed;
 
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	delalloc_start = start;
 	delalloc_end = 0;
+	page_started = 0;
 	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+		nr_delalloc = find_lock_delalloc_range(inode, tree,
+						       page,
+						       &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
 		if (nr_delalloc == 0) {
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		tree->ops->fill_delalloc(inode, delalloc_start,
-					 delalloc_end);
-		clear_extent_bit(tree, delalloc_start,
-				 delalloc_end,
-				 EXTENT_LOCKED | EXTENT_DELALLOC,
-				 1, 0, GFP_NOFS);
+		tree->ops->fill_delalloc(inode, page, delalloc_start,
+					 delalloc_end, &page_started);
 		delalloc_start = delalloc_end + 1;
 	}
+
+	/* did the fill delalloc function already unlock and start the IO? */
+	if (page_started) {
+		return 0;
+	}
+
 	lock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_start = start;
 
 	if (tree->ops && tree->ops->writepage_start_hook) {
-		ret = tree->ops->writepage_start_hook(page, start, page_end);
+		ret = tree->ops->writepage_start_hook(page, start,
+						      page_end);
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
 		block_start = em->block_start;
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		free_extent_map(em);
 		em = NULL;
 
-		if (block_start == EXTENT_MAP_HOLE ||
+		/*
+		 * compressed and inline extents are written through other
+		 * paths in the FS
+		 */
+		if (compressed || block_start == EXTENT_MAP_HOLE ||
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_extent(tree, unlock_start, cur + iosize -1,
 				      GFP_NOFS);
 
-			if (tree->ops && tree->ops->writepage_end_io_hook)
+			/*
+			 * end_io notification does not happen here for
+			 * compressed extents
+			 */
+			if (!compressed && tree->ops &&
+			    tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 cur + iosize - 1,
 							 NULL, 1);
-			cur = cur + iosize;
+			else if (compressed) {
+				/* we don't want to end_page_writeback on
+				 * a compressed extent.  this happens
+				 * elsewhere
+				 */
+				nr++;
+			}
+
+			cur += iosize;
 			pg_offset += iosize;
 			unlock_start = cur;
 			continue;
 		}
-
 		/* leave this out until we have a page_mkwrite call */
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
 				   EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			pg_offset += iosize;
 			continue;
 		}
+
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, pg_offset, bdev,
 						 &epd->bio, max_nr,
-						 end_bio_extent_writepage, 0);
+						 end_bio_extent_writepage,
+						 0, 0, 0);
 			if (ret)
 				SetPageError(page);
 		}
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	struct pagevec pvec;
+	unsigned long bio_flags = 0;
 
 	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
 			if (!pagevec_add(&pvec, page))
 				__pagevec_lru_add(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
-						&bio, 0);
+						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		__pagevec_lru_add(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
-					 end_bio_extent_preparewrite, 0);
+					 end_bio_extent_preparewrite, 0,
+					 0, 0);
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			}
 			if (!test_range_bit(tree, em->start,
 					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED, 0)) {
+					    EXTENT_LOCKED | EXTENT_WRITEBACK |
+					    EXTENT_ORDERED,
+					    0)) {
 				remove_extent_mapping(map, em);
 				/* once for the rb tree */
 				free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	int inc_all_pages = 0;
 	unsigned long num_pages;
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
-						      mirror_num);
+						      mirror_num, &bio_flags);
 			if (err) {
 				ret = err;
 				printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio)
-		submit_one_bio(READ, bio, mirror_num);
+		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
 	if (ret || !wait) {
 		if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae..86f859b87a6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
 #define EXTENT_BOUNDARY (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -28,14 +31,17 @@
 struct extent_state;
 
 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
-				       struct bio *bio, int mirror_num);
+				       struct bio *bio, int mirror_num,
+				       unsigned long bio_flags);
 struct extent_io_ops {
-	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+			     u64 start, u64 end, int *page_started);
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
-			      size_t size, struct bio *bio);
+			      size_t size, struct bio *bio,
+			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
 				       u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 int release_extent_buffer_tail_pages(struct extent_buffer *eb);
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int clear_writeback);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d..fd3ebfb8c3c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
 		return 0;
 
+	/*
+	 * don't merge compressed extents, we need to know their
+	 * actual size
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		if (rb && mergable_maps(merge, em)) {
 			em->start = merge->start;
 			em->len += merge->len;
+			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
 			rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		merge = rb_entry(rb, struct extent_map, rb_node);
 	if (rb && mergable_maps(em, merge)) {
 		em->len += merge->len;
+		/* grow by the absorbed map's block_len, not its file length */
+		em->block_len += merge->block_len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
 		free_extent_map(merge);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b26..abbcbeb28c7 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
 
 struct extent_map {
 	struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
 	u64 start;
 	u64 len;
 	u64 block_start;
+	u64 block_len;
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
 
 static inline u64 extent_map_block_end(struct extent_map *em)
 {
-	if (em->block_start + em->len < em->block_start)
+	if (em->block_start + em->block_len < em->block_start)
 		return (u64)-1;
-	return em->block_start + em->len;
+	return em->block_start + em->block_len;
 }
 
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d..f4d3fa71bc4 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
 			     u64 disk_offset, u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset)
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
 	btrfs_set_file_extent_offset(leaf, item, offset);
 	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+	btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, compression);
+	btrfs_set_file_extent_encryption(leaf, item, encryption);
+	btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
 	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);
@@ -213,6 +219,95 @@ found:
 	return 0;
 }
 
+/*
+ * checksum the page cache pages backing [start, start + len) of the inode
+ * and pass the results to btrfs_add_ordered_sum() for the ordered extent(s)
+ * found at those file offsets.
+ *
+ * One btrfs_sector_sum is filled in per page, computed over the whole page.
+ * When start moves outside the current ordered extent, the sums collected
+ * so far are added to it and a new btrfs_ordered_sum is allocated for the
+ * ordered extent found at the new offset.
+ *
+ * Every page in the range must be in the page cache and every offset must
+ * have an ordered extent; both are enforced with BUG_ON.  len is expected
+ * to be a multiple of PAGE_CACHE_SIZE.
+ *
+ * Returns 0 on success or -ENOMEM if the first allocation fails.
+ */
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
+	char *data;
+	struct page *page;
+	unsigned long this_sum_bytes = 0;
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+	if (!sums)
+		return -ENOMEM;
+
+	sector_sum = sums->sums;
+	sums->file_offset = start;
+	sums->len = len;
+	INIT_LIST_HEAD(&sums->list);
+	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+	BUG_ON(!ordered);
+
+	while(len > 0) {
+		/* start left this ordered extent, begin a new sum */
+		if (start >= ordered->file_offset + ordered->len ||
+		    start < ordered->file_offset) {
+			sums->len = this_sum_bytes;
+			this_sum_bytes = 0;
+			btrfs_add_ordered_sum(inode, ordered, sums);
+			btrfs_put_ordered_extent(ordered);
+
+			sums = kzalloc(btrfs_ordered_sum_size(root, len),
+				       GFP_NOFS);
+			BUG_ON(!sums);
+			sector_sum = sums->sums;
+			sums->len = len;
+			sums->file_offset = start;
+			ordered = btrfs_lookup_ordered_extent(inode,
+						      sums->file_offset);
+			BUG_ON(!ordered);
+		}
+
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		/* find_get_page returns NULL when the page is not cached */
+		BUG_ON(!page);
+
+		data = kmap_atomic(page, KM_USER0);
+		sector_sum->sum = ~(u32)0;
+		sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
+						  PAGE_CACHE_SIZE);
+		kunmap_atomic(data, KM_USER0);
+		btrfs_csum_final(sector_sum->sum,
+				 (char *)&sector_sum->sum);
+		sector_sum->offset = page_offset(page);
+		page_cache_release(page);
+
+		sector_sum++;
+		this_sum_bytes += PAGE_CACHE_SIZE;
+		start += PAGE_CACHE_SIZE;
+
+		/*
+		 * len is unsigned, so clamp the last step: subtracting a
+		 * full page from a short tail would wrap around and keep
+		 * the loop running far past the requested range
+		 */
+		WARN_ON(len < PAGE_CACHE_SIZE);
+		len -= min_t(unsigned long, len, PAGE_CACHE_SIZE);
+	}
+	btrfs_add_ordered_sum(inode, ordered, sums);
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
 {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add..0aa15436590 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-/* this does all the hard work for inserting an inline extent into
- * the btree.  Any existing inline extent is extended as required to make room,
- * otherwise things are inserted as required into the btree
- */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode,
-				u64 offset, size_t size,
-				struct page **pages, size_t page_offset,
-				int num_pages)
-{
-	struct btrfs_key key;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	char *kaddr;
-	unsigned long ptr;
-	struct btrfs_file_extent_item *ei;
-	struct page *page;
-	u32 datasize;
-	int err = 0;
-	int ret;
-	int i;
-	ssize_t cur_size;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	btrfs_set_trans_block_group(trans, inode);
-
-	key.objectid = inode->i_ino;
-	key.offset = offset;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-	if (ret < 0) {
-		err = ret;
-		goto fail;
-	}
-	if (ret == 1) {
-		struct btrfs_key found_key;
-
-		if (path->slots[0] == 0)
-			goto insert;
-
-		path->slots[0]--;
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-		if (found_key.objectid != inode->i_ino)
-			goto insert;
-
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			goto insert;
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			goto insert;
-		}
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		ret = 0;
-	}
-	if (ret == 0) {
-		u32 found_size;
-		u64 found_end;
-
-		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			err = ret;
-			btrfs_print_leaf(root, leaf);
-			printk("found wasn't inline offset %Lu inode %lu\n",
-			       offset, inode->i_ino);
-			goto fail;
-		}
-		found_size = btrfs_file_extent_inline_len(leaf,
-					  btrfs_item_nr(leaf, path->slots[0]));
-		found_end = key.offset + found_size;
-
-		if (found_end < offset + size) {
-			btrfs_release_path(root, path);
-			ret = btrfs_search_slot(trans, root, &key, path,
-						offset + size - found_end, 1);
-			BUG_ON(ret != 0);
-
-			ret = btrfs_extend_item(trans, root, path,
-						offset + size - found_end);
-			if (ret) {
-				err = ret;
-				goto fail;
-			}
-			leaf = path->nodes[0];
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_file_extent_item);
-			inode_add_bytes(inode, offset + size - found_end);
-		}
-		if (found_end < offset) {
-			ptr = btrfs_file_extent_inline_start(ei) + found_size;
-			memset_extent_buffer(leaf, 0, ptr, offset - found_end);
-		}
-	} else {
-insert:
-		btrfs_release_path(root, path);
-		datasize = offset + size - key.offset;
-		inode_add_bytes(inode, datasize);
-		datasize = btrfs_file_extent_calc_inline_size(datasize);
-		ret = btrfs_insert_empty_item(trans, root, path, &key,
-					      datasize);
-		if (ret) {
-			err = ret;
-			printk("got bad ret %d\n", ret);
-			goto fail;
-		}
-		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-		btrfs_set_file_extent_generation(leaf, ei, trans->transid);
-		btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
-	}
-	ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
-
-	cur_size = size;
-	i = 0;
-	while (size > 0) {
-		page = pages[i];
-		kaddr = kmap_atomic(page, KM_USER0);
-		cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
-		write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
-		kunmap_atomic(kaddr, KM_USER0);
-		page_offset = 0;
-		ptr += cur_size;
-		size -= cur_size;
-		if (i >= num_pages) {
-			printk("i %d num_pages %d\n", i, num_pages);
-		}
-		i++;
-	}
-	btrfs_mark_buffer_dirty(leaf);
-fail:
-	btrfs_free_path(path);
-	return err;
-}
-
 /*
  * after copy_from_user, pages need to be dirtied and we need to make
  * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 start_pos;
 	u64 end_of_last_block;
 	u64 end_pos = pos + write_bytes;
-	u64 inline_size;
-	int did_inline = 0;
 	loff_t isize = i_size_read(inode);
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
-						       0, 0, hole_size, 0);
+						       0, 0, hole_size, 0,
+						       hole_size, 0, 0, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size - 1, 0);
 			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			goto failed;
 	}
 
-	/*
-	 * either allocate an extent for the new bytes or setup the key
-	 * to show we are doing inline data in the extent
+	/* check for reserved extents on each page, we don't want
+	 * to reset the delalloc bit on things that already have
+	 * extents reserved.
 	 */
-	inline_size = end_pos;
-	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > root->fs_info->max_inline ||
-	    (inline_size & (root->sectorsize -1)) == 0 ||
-	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		/* check for reserved extents on each page, we don't want
-		 * to reset the delalloc bit on things that already have
-		 * extents reserved.
-		 */
-		btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
-		for (i = 0; i < num_pages; i++) {
-			struct page *p = pages[i];
-			SetPageUptodate(p);
-			ClearPageChecked(p);
-			set_page_dirty(p);
-		}
-	} else {
-		u64 aligned_end;
-		/* step one, delete the existing extents in this range */
-		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
-		mutex_lock(&BTRFS_I(inode)->extent_mutex);
-		err = btrfs_drop_extents(trans, root, inode, start_pos,
-					 aligned_end, aligned_end, &hint_byte);
-		if (err)
-			goto failed;
-		if (isize > inline_size)
-			inline_size = min_t(u64, isize, aligned_end);
-		inline_size -= start_pos;
-		err = insert_inline_extent(trans, root, inode, start_pos,
-					   inline_size, pages, 0, num_pages);
-		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
-		BUG_ON(err);
-		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
-		/*
-		 * an ugly way to do all the prop accounting around
-		 * the page bits and mapping tags
-		 */
-		set_page_writeback(pages[0]);
-		end_page_writeback(pages[0]);
-		did_inline = 1;
+	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = pages[i];
+		SetPageUptodate(p);
+		ClearPageChecked(p);
+		set_page_dirty(p);
 	}
 	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
-		if (did_inline)
-			BTRFS_I(inode)->disk_i_size = end_pos;
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	int ret;
 	int testend = 1;
 	unsigned long flags;
+	int compressed = 0;
 
 	WARN_ON(end < start);
 	if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			free_extent_map(em);
 			continue;
 		}
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		remove_extent_mapping(em_tree, em);
 
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->start = em->start;
 			split->len = start - em->start;
 			split->block_start = em->block_start;
+
+			if (compressed)
+				split->block_len = em->block_len;
+			else
+				split->block_len = split->len;
+
 			split->bdev = em->bdev;
 			split->flags = flags;
 			ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->bdev = em->bdev;
 			split->flags = flags;
 
-			split->block_start = em->block_start + diff;
+			if (compressed) {
+				split->block_len = em->block_len;
+				split->block_start = em->block_start;
+			} else {
+				split->block_len = split->len;
+				split->block_start = em->block_start + diff;
+			}
 
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 			struct btrfs_item *item;
 			item = btrfs_item_nr(leaf, slot);
 			extent_end = found_key.offset +
-			     btrfs_file_extent_inline_len(leaf, item);
+			     btrfs_file_extent_inline_len(leaf, extent);
 			extent_end = (extent_end + root->sectorsize - 1) &
 				~((u64)root->sectorsize -1 );
 		}
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 extent_end = 0;
 	u64 search_start = start;
 	u64 leaf_start;
+	u64 ram_bytes = 0;
+	u8 compression = 0;
+	u8 encryption = 0;
+	u16 other_encoding = 0;
 	u64 root_gen;
 	u64 root_owner;
 	struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int recow;
 	int ret;
 
+	inline_limit = 0;
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 			found_type = btrfs_file_extent_type(leaf, extent);
+			compression = btrfs_file_extent_compression(leaf,
+								    extent);
+			encryption = btrfs_file_extent_encryption(leaf,
+								  extent);
+			other_encoding = btrfs_file_extent_other_encoding(leaf,
+								  extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				extent_end =
 				     btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
 
 				extent_end = key.offset +
 				     btrfs_file_extent_num_bytes(leaf, extent);
+				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+								extent);
 				found_extent = 1;
 			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-				struct btrfs_item *item;
-				item = btrfs_item_nr(leaf, slot);
 				found_inline = 1;
 				extent_end = key.offset +
-				     btrfs_file_extent_inline_len(leaf, item);
+				     btrfs_file_extent_inline_len(leaf, extent);
 			}
 		} else {
 			extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
 			search_start = (extent_end + mask) & ~mask;
 		} else
 			search_start = extent_end;
-		if (end <= extent_end && start >= key.offset && found_inline) {
+
+		if (end <= extent_end && start >= key.offset && found_inline)
 			*hint_byte = EXTENT_MAP_INLINE;
-			goto out;
-		}
 
 		if (found_extent) {
 			read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
 			write_extent_buffer(leaf, &old,
 					    (unsigned long)extent, sizeof(old));
 
+			btrfs_set_file_extent_compression(leaf, extent,
+							  compression);
+			btrfs_set_file_extent_encryption(leaf, extent,
+							 encryption);
+			btrfs_set_file_extent_other_encoding(leaf, extent,
+							     other_encoding);
 			btrfs_set_file_extent_offset(leaf, extent,
 				    le64_to_cpu(old.offset) + end - key.offset);
 			WARN_ON(le64_to_cpu(old.num_bytes) <
 				(extent_end - end));
 			btrfs_set_file_extent_num_bytes(leaf, extent,
 							extent_end - end);
+
+			/*
+			 * set the ram bytes to the size of the full extent
+			 * before splitting.  This is a worst case flag,
+			 * but it's the best we can do because we don't know
+			 * how splitting affects compression
+			 */
+			btrfs_set_file_extent_ram_bytes(leaf, extent,
+							ram_bytes);
 			btrfs_set_file_extent_type(leaf, extent,
 						   BTRFS_FILE_EXTENT_REG);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d..9797592dc86 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
 #include "compat.h"
 #include "tree-log.h"
 #include "ref-cache.h"
+#include "compression.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 };
 
 static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
 
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
@@ -113,58 +115,375 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	return ret;
 }
 
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree.  The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree.  size is the length of
+ * the uncompressed data.  If compressed_size and compressed_pages are both
+ * set the item is filled from those pages, otherwise size bytes are copied
+ * from the page cache.  Returns 0, or -ENOMEM if no path can be allocated.
+ */
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode,
+				u64 start, size_t size, size_t compressed_size,
+				struct page **compressed_pages)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct page *page = NULL;
+	char *kaddr;
+	unsigned long ptr;
+	struct btrfs_file_extent_item *ei;
+	int err = 0;
+	int ret;
+	size_t cur_size = size;
+	size_t datasize;
+	unsigned long offset;
+	int use_compress = 0;
+
+	if (compressed_size && compressed_pages) {
+		use_compress = 1;
+		cur_size = compressed_size;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	btrfs_set_trans_block_group(trans, inode);
+
+	key.objectid = inode->i_ino;
+	key.offset = start;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	datasize = btrfs_file_extent_calc_inline_size(cur_size);
+
+	/* charge the uncompressed length to the inode's byte count */
+	inode_add_bytes(inode, size);
+	ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
+	BUG_ON(ret);
+	if (ret) {
+		err = ret;
+		printk("got bad ret %d\n", ret);
+		goto fail;
+	}
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+	ptr = btrfs_file_extent_inline_start(ei);
+
+	if (use_compress) {
+		struct page *cpage;
+		int i = 0;
+		while(compressed_size > 0) {
+			cpage = compressed_pages[i];
+			cur_size = min(compressed_size, PAGE_CACHE_SIZE);
+			kaddr = kmap(cpage);
+			write_extent_buffer(leaf, kaddr, ptr, cur_size);
+			kunmap(cpage);
+			i++;
+			ptr += cur_size;
+			compressed_size -= cur_size;
+		}
+		btrfs_set_file_extent_compression(leaf, ei,
+						  BTRFS_COMPRESS_ZLIB);
+	} else {
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		btrfs_set_file_extent_compression(leaf, ei, 0);
+		kaddr = kmap_atomic(page, KM_USER0);
+		offset = start & (PAGE_CACHE_SIZE - 1);
+		write_extent_buffer(leaf, kaddr + offset, ptr, size);
+		kunmap_atomic(kaddr, KM_USER0);
+		page_cache_release(page);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	BTRFS_I(inode)->disk_i_size = inode->i_size;
+	btrfs_update_inode(trans, root, inode);
+	return 0;
+fail:
+	btrfs_free_path(path);
+	return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file.  This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ *
+ * returns 0 if the inline extent was inserted, 1 if the range doesn't qualify
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct inode *inode, u64 start, u64 end,
+				 size_t compressed_size,
+				 struct page **compressed_pages)
+{
+	u64 isize = i_size_read(inode);
+	u64 actual_end = min(end + 1, isize);
+	u64 inline_len = actual_end - start;
+	u64 aligned_end = (end + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
+	u64 hint_byte;
+	u64 data_len = inline_len;
+	int ret;
+
+	if (compressed_size)
+		data_len = compressed_size;
+
+	if (start > 0 ||
+	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    (!compressed_size &&
+	    (actual_end & (root->sectorsize - 1)) == 0) ||
+	    end + 1 < isize ||
+	    data_len > root->fs_info->max_inline) {
+		return 1;
+	}
+
+	mutex_lock(&BTRFS_I(inode)->extent_mutex);
+	ret = btrfs_drop_extents(trans, root, inode, start,
+				 aligned_end, aligned_end, &hint_byte);
+	BUG_ON(ret);
+
+	ret = insert_inline_extent(trans, root, inode, start,
+				   inline_len, compressed_size,
+				   compressed_pages);
+	BUG_ON(ret);
+	btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+	return 0;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
  * allocate extents on disk for the range, and create ordered data structs
  * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
  */
-static int cow_file_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, struct page *locked_page,
+			  u64 start, u64 end, int *page_started)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
+	unsigned long ram_size;
+	u64 orig_start;
+	u64 disk_num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 orig_num_bytes;
+	u64 actual_end;
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
+	struct page **pages = NULL;
+	unsigned long nr_pages;
+	unsigned long nr_pages_ret = 0;
+	unsigned long total_compressed = 0;
+	unsigned long total_in = 0;
+	unsigned long max_compressed = 128 * 1024;
+	unsigned long max_uncompressed = 256 * 1024;
+	int i;
+	int will_compress;
 
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	orig_start = start;
+
+	/*
+	 * compression made this loop a bit ugly, but the basic idea is to
+	 * compress some pages but keep the total size of the compressed
+	 * extent relatively small.  If compression is off, this goto target
+	 * is never used.
+	 */
+again:
+	will_compress = 0;
+	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
+	actual_end = min_t(u64, i_size_read(inode), end + 1);
+	total_compressed = actual_end - start;
+
+	/* we want to make sure that amount of ram required to uncompress
+	 * an extent is reasonable, so we limit the total size in ram
+	 * of a compressed extent to 256k
+	 */
+	total_compressed = min(total_compressed, max_uncompressed);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	orig_num_bytes = num_bytes;
+	disk_num_bytes = num_bytes;
+	total_in = 0;
+	ret = 0;
 
-	if (alloc_hint == EXTENT_MAP_INLINE)
-		goto out;
+	/* we do compression for mount -o compress and when the
+	 * inode has not been flagged as nocompress
+	 */
+	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+	    btrfs_test_opt(root, COMPRESS)) {
+		WARN_ON(pages);
+		pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+		/* we want to make sure the amount of IO required to satisfy
+		 * a random read is reasonably small, so we limit the size
+		 * of a compressed extent to 128k
+		 */
+		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+						total_compressed, pages,
+						nr_pages, &nr_pages_ret,
+						&total_in,
+						&total_compressed,
+						max_compressed);
+
+		if (!ret) {
+			unsigned long offset = total_compressed &
+				(PAGE_CACHE_SIZE - 1);
+			struct page *page = pages[nr_pages_ret - 1];
+			char *kaddr;
+
+			/* zero the tail end of the last page, we might be
+			 * sending it down to disk
+			 */
+			if (offset) {
+				kaddr = kmap_atomic(page, KM_USER0);
+				memset(kaddr + offset, 0,
+				       PAGE_CACHE_SIZE - offset);
+				kunmap_atomic(kaddr, KM_USER0);
+			}
+			will_compress = 1;
+		}
+	}
+	if (start == 0) {
+		/* lets try to make an inline extent */
+		if (ret || total_in < (end - start + 1)) {
+			/* we didn't compress the entire range, try
+			 * to make an uncompressed inline extent.  This
+			 * is almost sure to fail, but maybe inline sizes
+			 * will get bigger later
+			 */
+			ret = cow_file_range_inline(trans, root, inode,
+						    start, end, 0, NULL);
+		} else {
+			ret = cow_file_range_inline(trans, root, inode,
+						    start, end,
+						    total_compressed, pages);
+		}
+		if (ret == 0) {
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start, end, NULL,
+						     1, 1, 1);
+			*page_started = 1;
+			ret = 0;
+			goto free_pages_out;
+		}
+	}
+
+	if (will_compress) {
+		/*
+		 * we aren't doing an inline extent, so round the compressed
+		 * size up to a block size boundary so the allocator does sane
+		 * things
+		 */
+		total_compressed = (total_compressed + blocksize - 1) &
+			~(blocksize - 1);
+
+		/*
+		 * one last check to make sure the compression is really a
+		 * win, compare the page count read with the blocks on disk
+		 */
+		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+			~(PAGE_CACHE_SIZE - 1);
+		if (total_compressed >= total_in) {
+			will_compress = 0;
+		} else {
+			disk_num_bytes = total_compressed;
+			num_bytes = total_in;
+		}
+	}
+	if (!will_compress && pages) {
+		/*
+		 * the compression code ran but failed to make things smaller,
+		 * free any pages it allocated and our page pointer array
+		 */
+		for (i = 0; i < nr_pages_ret; i++) {
+			page_cache_release(pages[i]);
+		}
+		kfree(pages);
+		pages = NULL;
+		total_compressed = 0;
+		nr_pages_ret = 0;
+
+		/* flag the file so we don't compress in the future */
+		btrfs_set_flag(inode, NOCOMPRESS);
+	}
+
+	BUG_ON(disk_num_bytes >
+	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
-	while(num_bytes > 0) {
-		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
+	while(disk_num_bytes > 0) {
+		unsigned long min_bytes;
+
+		/*
+		 * the max size of a compressed extent is pretty small,
+		 * make the code a little less complex by forcing
+		 * the allocator to find a whole compressed extent at once
+		 */
+		if (will_compress)
+			min_bytes = disk_num_bytes;
+		else
+			min_bytes = root->sectorsize;
+
+		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   root->sectorsize, 0, alloc_hint,
+					   min_bytes, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
-			goto out;
+			goto free_pages_out_fail;
 		}
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
-		em->len = ins.offset;
+
+		if (will_compress) {
+			ram_size = num_bytes;
+			em->len = num_bytes;
+		} else {
+			/* ramsize == disk size */
+			ram_size = ins.offset;
+			em->len = ins.offset;
+		}
+
 		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
+
 		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+		if (will_compress)
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
 		while(1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 				break;
 			}
 			btrfs_drop_extent_cache(inode, start,
-						start + ins.offset - 1, 0);
+						start + ram_size - 1, 0);
 		}
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ins.offset, 0);
+					       ram_size, cur_alloc_size, 0,
+					       will_compress);
 		BUG_ON(ret);
-		if (num_bytes < cur_alloc_size) {
-			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+
+		if (disk_num_bytes < cur_alloc_size) {
+			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
 			       cur_alloc_size);
 			break;
 		}
+
+		if (will_compress) {
+			/*
+			 * we're doing compression, and we need to
+			 * submit the compressed extents down to the device.
+			 *
+			 * We lock down all the file pages, clearing their
+			 * dirty bits and setting them writeback.  Everyone
+			 * that wants to modify the page will wait on the
+			 * ordered extent above.
+			 *
+			 * The writeback bits on the file pages are
+			 * cleared when the compressed pages are on disk
+			 */
+			btrfs_end_transaction(trans, root);
+
+			if (start <= page_offset(locked_page) &&
+			    page_offset(locked_page) < start + ram_size) {
+				*page_started = 1;
+			}
+
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start,
+						     start + ram_size - 1,
+						     NULL, 1, 1, 0);
+
+			ret = btrfs_submit_compressed_write(inode, start,
+						 ram_size, ins.objectid,
+						 cur_alloc_size, pages,
+						 nr_pages_ret);
+
+			BUG_ON(ret);
+			trans = btrfs_join_transaction(root, 1);
+			if (start + ram_size < end) {
+				start += ram_size;
+				alloc_hint = ins.objectid + ins.offset;
+				/* pages will be freed at end_bio time */
+				pages = NULL;
+				goto again;
+			} else {
+				/* we've written everything, time to go */
+				break;
+			}
+		}
+		/* we're not doing compressed IO, don't unlock the first
+		 * page (which the caller expects to stay locked), don't
+		 * clear any dirty bits and don't set any writeback bits
+		 */
+		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+					     start, start + ram_size - 1,
+					     locked_page, 0, 0, 0);
+		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+
+	ret = 0;
 out:
 	btrfs_end_transaction(trans, root);
+
 	return ret;
+
+free_pages_out_fail:
+	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+				     start, end, locked_page, 0, 0, 0);
+free_pages_out:
+	for (i = 0; i < nr_pages_ret; i++)
+		page_cache_release(pages[i]);
+	if (pages)
+		kfree(pages);
+
+	goto out;
 }
 
 /*
@@ -203,7 +591,8 @@ out:
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started)
 {
 	u64 extent_start;
 	u64 extent_end;
@@ -260,6 +649,11 @@ again:
 		extent_end = extent_start + extent_num_bytes;
 		err = 0;
 
+		if (btrfs_file_extent_compression(leaf, item) ||
+		    btrfs_file_extent_encryption(leaf,item) ||
+		    btrfs_file_extent_other_encoding(leaf, item))
+			goto not_found;
+
 		if (loops && start != extent_start)
 			goto not_found;
 
@@ -284,7 +678,8 @@ again:
 		bytenr += btrfs_file_extent_offset(leaf, item);
 		extent_num_bytes = min(end + 1, extent_end) - start;
 		ret = btrfs_add_ordered_extent(inode, start, bytenr,
-						extent_num_bytes, 1);
+						extent_num_bytes,
+						extent_num_bytes, 1, 0);
 		if (ret) {
 			err = ret;
 			goto out;
@@ -300,7 +695,8 @@ again:
 not_found:
 		btrfs_end_transaction(trans, root);
 		btrfs_free_path(path);
-		return cow_file_range(inode, start, end);
+		return cow_file_range(inode, locked_page, start, end,
+				      page_started);
 	}
 out:
 	WARN_ON(err);
@@ -312,16 +708,19 @@ out:
 /*
  * extent_io.c call back to do delayed allocation processing
  */
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
-		ret = run_delalloc_nocow(inode, start, end);
+		ret = run_delalloc_nocow(inode, locked_page, start, end,
+					 page_started);
 	else
-		ret = cow_file_range(inode, start, end);
+		ret = cow_file_range(inode, locked_page, start, end,
+				     page_started);
 
 	return ret;
 }
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
  * we don't create bios that span stripes or chunks
  */
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio)
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * are inserted into the btree
  */
 int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num)
+			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
  * or reading the csums from the tree before a read
  */
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num)
+			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	if (!(rw & (1 << BIO_RW))) {
 		btrfs_lookup_bio_sums(root, inode, bio);
+
+		if (bio_flags & EXTENT_BIO_COMPRESSED) {
+			return btrfs_submit_compressed_read(inode, bio,
+						    mirror_num, bio_flags);
+		}
+
 		goto mapit;
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   __btrfs_submit_bio_hook);
+				   bio_flags, __btrfs_submit_bio_hook);
 mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
@@ -539,7 +945,7 @@ out_page:
  * good idea.  This causes problems because we want to make sure COW
  * properly happens and the data=ordered rules are followed.
  *
- * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * In our case any range that doesn't have the ORDERED bit set
  * hasn't been properly setup for IO.  We kick off an async process
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
 					  ordered_extent->start);
 	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
-					     ordered_extent->len);
+					     ordered_extent->disk_len);
 	btrfs_set_file_extent_offset(leaf, extent_item, 0);
+
+	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+		btrfs_set_file_extent_compression(leaf, extent_item, 1);
+	else
+		btrfs_set_file_extent_compression(leaf, extent_item, 0);
+	btrfs_set_file_extent_encryption(leaf, extent_item, 0);
+	btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
+
+	/* ram bytes = extent_num_bytes for now */
 	btrfs_set_file_extent_num_bytes(leaf, extent_item,
 					ordered_extent->len);
+	btrfs_set_file_extent_ram_bytes(leaf, extent_item,
+					ordered_extent->len);
 	btrfs_mark_buffer_dirty(leaf);
 
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	ins.objectid = ordered_extent->start;
-	ins.offset = ordered_extent->len;
+	ins.offset = ordered_extent->disk_len;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
 					  root->root_key.objectid,
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 	int ret;
 	int rw;
 	u64 logical;
+	unsigned long bio_flags = 0;
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 		}
 		logical = start - em->start;
 		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			bio_flags = EXTENT_BIO_COMPRESSED;
 		failrec->logical = logical;
 		free_extent_map(em);
 		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 		rw = READ;
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-						      failrec->last_mirror);
+						      failrec->last_mirror,
+						      bio_flags);
 	return 0;
 }
 
@@ -1644,10 +2065,8 @@ search_again:
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
 			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-				struct btrfs_item *item = btrfs_item_nr(leaf,
-							        path->slots[0]);
 				item_end += btrfs_file_extent_inline_len(leaf,
-									 item);
+									 fi);
 			}
 			item_end--;
 		}
@@ -1715,7 +2134,14 @@ search_again:
 				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-			if (!del_item) {
+			/*
+			 * we can't truncate inline items that have had
+			 * special encodings
+			 */
+			if (!del_item &&
+			    btrfs_file_extent_compression(leaf, fi) == 0 &&
+			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
+			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
 				u32 size = new_size - found_key.offset;
 
 				if (root->ref_cows) {
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       hole_start, 0, 0,
-						       hole_size, 0);
+						       hole_size, 0, hole_size,
+						       0, 0, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
 						(u64)-1, 0);
 			btrfs_check_file(root, inode);
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 	start_diff = map_start - em->start;
 	em->start = map_start;
 	em->len = map_len;
-	if (em->block_start < EXTENT_MAP_LAST_BYTE)
+	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 		em->block_start += start_diff;
+		em->block_len -= start_diff;
+	}
 	return add_extent_mapping(em_tree, em);
 }
 
+/* decompress the zlib data of an inline extent item into page.  Returns 0
+ * (zeroing the page range if decompression fails) or -ENOMEM.
+ */
+static noinline int uncompress_inline(struct btrfs_path *path,
+		struct inode *inode, struct page *page, size_t pg_offset,
+		u64 extent_offset, struct btrfs_file_extent_item *item)
+{
+	int ret;
+	struct extent_buffer *leaf = path->nodes[0];
+	char *tmp;
+	size_t max_size;
+	unsigned long inline_size, ptr;
+
+	WARN_ON(pg_offset != 0);
+	max_size = btrfs_file_extent_ram_bytes(leaf, item);
+	inline_size = btrfs_file_extent_inline_item_len(leaf,
+					btrfs_item_nr(leaf, path->slots[0]));
+	tmp = kmalloc(inline_size, GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	ptr = btrfs_file_extent_inline_start(item);
+	read_extent_buffer(leaf, tmp, ptr, inline_size);
+	max_size = min(PAGE_CACHE_SIZE, max_size);
+	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+				    inline_size, max_size);
+	if (ret) {
+		char *kaddr = kmap_atomic(page, KM_USER0);
+		unsigned long copy_size = min_t(u64,
+			PAGE_CACHE_SIZE - pg_offset, max_size - extent_offset);
+		memset(kaddr + pg_offset, 0, copy_size);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+	kfree(tmp);
+	return 0;
+}
+
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
  * the ugly parts come from merging extents from the disk with the
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
+	int compressed;
 
 again:
 	spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 
 	if (!path) {
 		path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
 
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
+	compressed = btrfs_file_extent_compression(leaf, item);
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
 		extent_end = extent_start +
 		       btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
 			em->block_start = EXTENT_MAP_HOLE;
 			goto insert;
 		}
-		bytenr += btrfs_file_extent_offset(leaf, item);
-		em->block_start = bytenr;
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
+		if (compressed) {
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+			em->block_start = bytenr;
+			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+									 item);
+		} else {
+			bytenr += btrfs_file_extent_offset(leaf, item);
+			em->block_start = bytenr;
+			em->block_len = em->len;
+		}
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		u64 page_start;
@@ -3018,8 +3495,7 @@ again:
 		size_t extent_offset;
 		size_t copy_size;
 
-		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
-						    path->slots[0]));
+		size = btrfs_file_extent_inline_len(leaf, item);
 		extent_end = (extent_start + size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
 		}
 		em->block_start = EXTENT_MAP_INLINE;
 
-		if (!page) {
+		if (!page || create) {
 			em->start = extent_start;
-			em->len = size;
+			em->len = (size + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
 			goto out;
 		}
 
@@ -3048,11 +3525,22 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-		map = kmap(page);
+		if (compressed)
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		if (create == 0 && !PageUptodate(page)) {
-			read_extent_buffer(leaf, map + pg_offset, ptr,
-					   copy_size);
+			if (btrfs_file_extent_compression(leaf, item) ==
+			    BTRFS_COMPRESS_ZLIB) {
+				ret = uncompress_inline(path, inode, page,
+							pg_offset,
+							extent_offset, item);
+				BUG_ON(ret);
+			} else {
+				map = kmap(page);
+				read_extent_buffer(leaf, map + pg_offset, ptr,
+						   copy_size);
+				kunmap(page);
+			}
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
 			if (!trans) {
@@ -3063,11 +3551,12 @@ again:
 				trans = btrfs_join_transaction(root, 1);
 				goto again;
 			}
+			map = kmap(page);
 			write_extent_buffer(leaf, map + pg_offset, ptr,
 					    copy_size);
+			kunmap(page);
 			btrfs_mark_buffer_dirty(leaf);
 		}
-		kunmap(page);
 		set_extent_uptodate(io_tree, em->start,
 				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 	btrfs_set_file_extent_type(leaf, ei,
 				   BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_compression(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
 	ptr = btrfs_file_extent_inline_start(ei);
 	write_extent_buffer(leaf, symname, ptr, name_len);
 	btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2eb6caba57c..b5745bb96d4 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  * inserted.
  */
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, int nocow)
+			     u64 start, u64 len, u64 disk_len, int nocow,
+			     int compressed)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
+	entry->disk_len = disk_len;
 	entry->inode = inode;
 	if (nocow)
 		set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
+	if (compressed)
+		set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
 
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * for pdflush to find them
 	 */
 	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
-	if (wait)
+	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
+	}
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a14..1ef464145d2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
 
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
 	/* disk byte number */
 	u64 start;
 
-	/* length of the extent in bytes */
+	/* ram length of the extent in bytes */
 	u64 len;
 
+	/* extent length on disk */
+	u64 disk_len;
+
 	/* flags (described above) */
 	unsigned long flags;
 
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, int nocow);
+			     u64 start, u64 len, u64 disk_len, int nocow,
+			     int compressed);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bd9ab3e9a7f..64725c13aa1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			if (btrfs_file_extent_type(l, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE) {
 				printk("\t\tinline extent data size %u\n",
-			           btrfs_file_extent_inline_len(l, item));
+			           btrfs_file_extent_inline_len(l, fi));
 				break;
 			}
 			printk("\t\textent data disk bytenr %llu nr %llu\n",
 			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
 			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-			printk("\t\textent data offset %llu nr %llu\n",
+			printk("\t\textent data offset %llu nr %llu ram %llu\n",
 			  (unsigned long long)btrfs_file_extent_offset(l, fi),
-			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
+			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
+			  (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e6039825b7..431fdf144b5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
 #include "volumes.h"
 #include "version.h"
 #include "export.h"
+#include "compression.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err,
+	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
 	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
+	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
 	{Opt_noacl, "noacl"},
 	{Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, NODATACOW);
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
+		case Opt_compress:
+			printk(KERN_INFO "btrfs: use compression\n");
+			btrfs_set_opt(info->mount_opt, COMPRESS);
+			break;
 		case Opt_ssd:
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
 			btrfs_set_opt(info->mount_opt, SSD);
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
 	err = btrfs_interface_init();
 	if (err)
 		goto free_extent_map;
+
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
+	btrfs_zlib_exit();
 }
 
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf618cc8b34..e6d579053a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	if (found_type == BTRFS_FILE_EXTENT_REG)
 		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
 	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		size = btrfs_file_extent_inline_len(eb,
-						    btrfs_item_nr(eb, slot));
+		size = btrfs_file_extent_inline_len(eb, item);
 		extent_end = (start + size + mask) & ~mask;
 	} else {
 		ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51..7db4cfd03a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
 	em->start = key.offset;
 	em->len = *num_bytes;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->start = logical;
 	em->len = length;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 00000000000..e99309180a1
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+/* Plan: call deflate() with avail_in == *sourcelen, avail_out = *dstlen - 12
+	and flush == Z_FINISH.  If it doesn't manage to finish, call it again
+	with avail_in == 0 and avail_out set to the remaining 12 bytes for it
+	to clean up.
+   Q: Is 12 bytes sufficient?
+   NOTE(review): STREAM_END_SPACE is not referenced anywhere in this file.
+*/
+#define STREAM_END_SPACE 12
+
+struct workspace {
+	z_stream inf_strm;	/* inflate state; .workspace is vmalloc'ed */
+	z_stream def_strm;	/* deflate state; .workspace is vmalloc'ed */
+	char *buf;		/* page-sized inflate output buffer */
+	struct list_head list;	/* link on idle_workspace while unused */
+};
+
+static LIST_HEAD(idle_workspace);	/* workspaces not currently in use */
+static DEFINE_SPINLOCK(workspace_lock);	/* guards idle list + num_workspace */
+static unsigned long num_workspace;	/* entries on idle_workspace */
+static atomic_t alloc_workspace = ATOMIC_INIT(0);	/* live allocations */
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);	/* waiters for a workspace */
+
+/*
+ * this finds an idle zlib workspace or allocates a new one.  NULL is never
+ * returned: allocation failure yields ERR_PTR(-ENOMEM), so test with IS_ERR().
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+	struct workspace *workspace;
+	int ret;
+	int cpus = num_online_cpus();
+
+again:
+	spin_lock(&workspace_lock);
+	if (!list_empty(&idle_workspace)) {
+		workspace = list_entry(idle_workspace.next, struct workspace,
+				       list);
+		list_del(&workspace->list);
+		num_workspace--;
+		spin_unlock(&workspace_lock);
+		return workspace;
+
+	}
+	spin_unlock(&workspace_lock);
+	if (atomic_read(&alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&alloc_workspace) > cpus)
+			schedule();
+		finish_wait(&workspace_wait, &wait);
+		goto again;	/* recheck the idle list after waking */
+	}
+	atomic_inc(&alloc_workspace);	/* undone on the fail path */
+	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+	if (!workspace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+	if (!workspace->def_strm.workspace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+	if (!workspace->inf_strm.workspace) {
+		ret = -ENOMEM;
+		goto fail_inflate;
+	}
+	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+	if (!workspace->buf) {
+		ret = -ENOMEM;
+		goto fail_kmalloc;
+	}
+	return workspace;
+
+fail_kmalloc:
+	vfree(workspace->inf_strm.workspace);
+fail_inflate:
+	vfree(workspace->def_strm.workspace);
+fail:
+	kfree(workspace);
+	atomic_dec(&alloc_workspace);
+	wake_up(&workspace_wait);
+	return ERR_PTR(ret);
+}
+
+/*
+ * put a workspace struct back on the idle list, or free it once that list
+ * has num_online_cpus() or more entries.  Wakes waiters; always returns 0.
+ */
+static int free_workspace(struct workspace *workspace)
+{
+	spin_lock(&workspace_lock);
+	if (num_workspace < num_online_cpus()) {
+		list_add_tail(&workspace->list, &idle_workspace);
+		num_workspace++;
+		spin_unlock(&workspace_lock);
+		if (waitqueue_active(&workspace_wait))
+			wake_up(&workspace_wait);
+		return 0;	/* kept on the idle list for reuse */
+	}
+	spin_unlock(&workspace_lock);
+	vfree(workspace->def_strm.workspace);
+	vfree(workspace->inf_strm.workspace);
+	kfree(workspace->buf);
+	kfree(workspace);
+
+	atomic_dec(&alloc_workspace);	/* one fewer allocated workspace */
+	if (waitqueue_active(&workspace_wait))
+		wake_up(&workspace_wait);
+	return 0;
+}
+
+/*
+ * module exit helper: tear down every workspace still on the idle list
+ */
+static void free_workspaces(void)
+{
+	struct workspace *ws;
+
+	while (!list_empty(&idle_workspace)) {
+		ws = list_entry(idle_workspace.next, struct workspace, list);
+		list_del(&ws->list);
+		vfree(ws->def_strm.workspace);
+		vfree(ws->inf_strm.workspace);
+		kfree(ws->buf);
+		kfree(ws);
+		atomic_dec(&alloc_workspace);
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages.  Returns 0 on success and -1 on any failure.
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out)
+{
+	int ret;
+	struct workspace *workspace;
+	char *data_in;
+	char *cpage_out;
+	int nr_pages = 0;
+	struct page *in_page = NULL;
+	struct page *out_page = NULL;
+	unsigned long bytes_left;
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	workspace = find_zlib_workspace();
+	if (IS_ERR(workspace))	/* failures come back as ERR_PTR, not NULL */
+		return -1;
+
+	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+		printk(KERN_WARNING "deflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+
+	workspace->def_strm.total_in = 0;
+	workspace->def_strm.total_out = 0;
+
+	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	data_in = kmap(in_page);
+
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (!out_page) {
+		ret = -1;
+		goto out;
+	}
+	cpage_out = kmap(out_page);
+	pages[0] = out_page;
+	nr_pages = 1;
+
+	workspace->def_strm.next_in = data_in;
+	workspace->def_strm.next_out = cpage_out;
+	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+	while (workspace->def_strm.total_in < len) {
+		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+		if (ret != Z_OK) {
+			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+			       ret);
+			zlib_deflateEnd(&workspace->def_strm);
+			ret = -1;
+			goto out;
+		}
+
+		/* we're making it bigger, give up */
+		if (workspace->def_strm.total_in > 8192 &&
+		    workspace->def_strm.total_in <
+		    workspace->def_strm.total_out) {
+			ret = -1;
+			goto out;
+		}
+		/* we need another page for writing out.  Test this before
+		 * the total_in so we pull in a new page for the stream end if
+		 * required.  out_page stays NULL on failure: no second kunmap
+		 */
+		if (workspace->def_strm.avail_out == 0) {
+			kunmap(out_page);
+			out_page = NULL;
+			if (nr_pages != nr_dest_pages)
+				out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			if (!out_page) {
+				ret = -1;
+				goto out;
+			}
+			cpage_out = kmap(out_page);
+			pages[nr_pages] = out_page;
+			nr_pages++;
+			workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+			workspace->def_strm.next_out = cpage_out;
+		}
+		/* we're all done */
+		if (workspace->def_strm.total_in >= len)
+			break;
+
+		/* we've read in a full page, get a new one */
+		if (workspace->def_strm.avail_in == 0) {
+			if (workspace->def_strm.total_out > max_out)
+				break;
+
+			bytes_left = len - workspace->def_strm.total_in;
+			kunmap(in_page);
+			page_cache_release(in_page);
+
+			start += PAGE_CACHE_SIZE;
+			in_page = find_get_page(mapping,
+						start >> PAGE_CACHE_SHIFT);
+			data_in = kmap(in_page);
+			workspace->def_strm.avail_in = min(bytes_left,
+							   PAGE_CACHE_SIZE);
+			workspace->def_strm.next_in = data_in;
+		}
+	}
+	workspace->def_strm.avail_in = 0;
+	ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+	zlib_deflateEnd(&workspace->def_strm);
+
+	if (ret != Z_STREAM_END) {
+		ret = -1;
+		goto out;
+	}
+
+	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+		ret = -1;
+		goto out;
+	}
+
+	ret = 0;
+	*total_out = workspace->def_strm.total_out;
+	*total_in = workspace->def_strm.total_in;
+out:
+	*out_pages = nr_pages;
+	if (out_page)
+		kunmap(out_page);
+
+	if (in_page) {
+		kunmap(in_page);
+		page_cache_release(in_page);
+	}
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt counts the pages in the biovec, srclen the bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ *
+ * Returns 0 on success, -ENOMEM without a workspace, -1 on other errors.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+			      u64 disk_start,
+			      struct bio_vec *bvec,
+			      int vcnt,
+			      size_t srclen)
+{
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	struct workspace *workspace;
+	char *data_in;
+	size_t total_out = 0;
+	unsigned long page_bytes_left;
+	unsigned long page_in_index = 0;
+	unsigned long page_out_index = 0;
+	struct page *page_out;
+	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+					PAGE_CACHE_SIZE;
+	unsigned long buf_start;
+	unsigned long buf_offset;
+	unsigned long bytes;
+	unsigned long working_bytes;
+	unsigned long pg_offset;
+	unsigned long start_byte;
+	unsigned long current_buf_start;
+	char *kaddr;
+
+	workspace = find_zlib_workspace();
+	if (IS_ERR(workspace))	/* failures come back as ERR_PTR, not NULL */
+		return -ENOMEM;
+
+	data_in = kmap(pages_in[page_in_index]);
+	workspace->inf_strm.next_in = data_in;
+	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+	workspace->inf_strm.total_in = 0;
+
+	workspace->inf_strm.total_out = 0;
+	workspace->inf_strm.next_out = workspace->buf;
+	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	page_out = bvec[page_out_index].bv_page;
+	page_bytes_left = PAGE_CACHE_SIZE;
+	pg_offset = 0;
+
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->inf_strm.next_in += 2;
+		workspace->inf_strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+		printk(KERN_WARNING "inflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+	while(workspace->inf_strm.total_in < srclen) {
+		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END) {
+			break;
+		}
+
+		/*
+		 * buf_start is the offset in the decompressed stream of
+		 * the first byte in our workspace buffer
+		 */
+		buf_start = total_out;
+
+		/* total_out is the stream offset where the buffer ends */
+		total_out = workspace->inf_strm.total_out;
+
+		working_bytes = total_out - buf_start;
+
+		/*
+		 * start byte is the first byte of the page we're currently
+		 * copying into relative to the start of the compressed data.
+		 */
+		start_byte = page_offset(page_out) - disk_start;
+
+		if (working_bytes == 0) {
+			/* we didn't make progress in this inflate
+			 * call, we're done
+			 */
+			if (ret != Z_STREAM_END)
+				ret = -1;
+			break;
+		}
+
+		/* we haven't yet hit data corresponding to this page */
+		if (total_out <= start_byte) {
+			goto next;
+		}
+
+		/*
+		 * the start of the data we care about is offset into
+		 * the middle of our working buffer
+		 */
+		if (total_out > start_byte && buf_start < start_byte) {
+			buf_offset = start_byte - buf_start;
+			working_bytes -= buf_offset;
+		} else {
+			buf_offset = 0;
+		}
+		current_buf_start = buf_start;
+
+		/* copy bytes from the working buffer into the pages */
+		while(working_bytes > 0) {
+			bytes = min(PAGE_CACHE_SIZE - pg_offset,
+				    PAGE_CACHE_SIZE - buf_offset);
+			bytes = min(bytes, working_bytes);
+			kaddr = kmap_atomic(page_out, KM_USER0);
+			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+			       bytes);
+			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page_out);
+
+			pg_offset += bytes;
+			page_bytes_left -= bytes;
+			buf_offset += bytes;
+			working_bytes -= bytes;
+			current_buf_start += bytes;
+
+			/* check if we need to pick another page */
+			if (page_bytes_left == 0) {
+				page_out_index++;
+				if (page_out_index >= vcnt) {
+					ret = 0;
+					goto done;
+				}
+				page_out = bvec[page_out_index].bv_page;
+				pg_offset = 0;
+				page_bytes_left = PAGE_CACHE_SIZE;
+				start_byte = page_offset(page_out) - disk_start;
+
+				/*
+				 * make sure our new page is covered by this
+				 * working buffer
+				 */
+				if (total_out <= start_byte) {
+					goto next;
+				}
+
+				/* the next page in the biovec might not
+				 * be adjacent to the last page, but it
+				 * might still be found inside this working
+				 * buffer.  bump our offset pointer
+				 */
+				if (total_out > start_byte &&
+				    current_buf_start < start_byte) {
+					buf_offset = start_byte - buf_start;
+					working_bytes = total_out - start_byte;
+					current_buf_start = buf_start +
+						buf_offset;
+				}
+			}
+		}
+next:
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+		if (workspace->inf_strm.avail_in == 0) {
+			unsigned long tmp;
+			kunmap(pages_in[page_in_index]);
+			page_in_index++;
+			if (page_in_index >= total_pages_in) {
+				data_in = NULL;
+				break;
+			}
+			data_in = kmap(pages_in[page_in_index]);
+			workspace->inf_strm.next_in = data_in;
+			tmp = srclen - workspace->inf_strm.total_in;
+			workspace->inf_strm.avail_in = min(tmp,
+							   PAGE_CACHE_SIZE);
+		}
+	}
+	if (ret != Z_STREAM_END) {
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+done:
+	zlib_inflateEnd(&workspace->inf_strm);
+out:
+	if (data_in)	/* still mapped, also when inflateInit2 failed */
+		kunmap(pages_in[page_in_index]);
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.  start_byte is
+ * the offset in the decompressed stream where the destlen bytes we copy begin
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen)
+{
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	struct workspace *workspace;
+	unsigned long bytes_left = destlen;
+	unsigned long total_out = 0;
+	unsigned long pg_offset = 0;	/* must persist across inflate passes */
+	char *kaddr;
+
+	if (destlen > PAGE_CACHE_SIZE)
+		return -ENOMEM;
+
+	workspace = find_zlib_workspace();
+	if (IS_ERR(workspace))	/* failures come back as ERR_PTR, not NULL */
+		return -ENOMEM;
+
+	workspace->inf_strm.next_in = data_in;
+	workspace->inf_strm.avail_in = srclen;
+	workspace->inf_strm.total_in = 0;
+
+	workspace->inf_strm.next_out = workspace->buf;
+	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->inf_strm.total_out = 0;
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->inf_strm.next_in += 2;
+		workspace->inf_strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+		printk(KERN_WARNING "inflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+
+	while(bytes_left > 0) {
+		unsigned long buf_start;
+		unsigned long buf_offset;
+		unsigned long bytes;
+
+		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END) {
+			break;
+		}
+
+		buf_start = total_out;
+		total_out = workspace->inf_strm.total_out;
+
+		if (total_out == buf_start) {
+			ret = -1;
+			break;
+		}
+
+		if (total_out <= start_byte) {
+			goto next;
+		}
+
+		if (total_out > start_byte && buf_start < start_byte) {
+			buf_offset = start_byte - buf_start;
+		} else {
+			buf_offset = 0;
+		}
+
+		bytes = min(PAGE_CACHE_SIZE - pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, bytes_left);
+
+		kaddr = kmap_atomic(dest_page, KM_USER0);
+		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		pg_offset += bytes;
+		bytes_left -= bytes;
+next:
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	}
+	if (ret != Z_STREAM_END && bytes_left != 0) {
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+	zlib_inflateEnd(&workspace->inf_strm);
+out:
+	free_workspace(workspace);
+	return ret;
+}
+
+void btrfs_zlib_exit(void)
+{
+	free_workspaces();	/* module exit: release the idle workspaces */
+}
-- 
cgit v1.2.3-70-g09d2


From 2517920135b0d29e70453e5b03d70d7b94207df3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: nuke fs wide allocation mutex V2

This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.

There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.

The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same.  Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.

Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so if we cannot lock the extent, move on to the next one in the
tree and we'll come back to that one.  I have tested this heavily and it does
not appear to break anything.  This has to be applied on top of my
find_free_extent redo patch.

I tested this patch on top of Yan's space rebalancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.c            |   3 +-
 fs/btrfs/ctree.h            |   8 +-
 fs/btrfs/disk-io.c          |   3 +-
 fs/btrfs/extent-tree.c      | 333 +++++++++++++++++++++-----------------------
 fs/btrfs/extent_io.c        |  14 ++
 fs/btrfs/extent_io.h        |   2 +
 fs/btrfs/free-space-cache.c |  92 ++++++++----
 fs/btrfs/transaction.c      |   2 -
 fs/btrfs/tree-defrag.c      |   3 -
 fs/btrfs/tree-log.c         |   4 +-
 fs/btrfs/volumes.c          |   2 -
 11 files changed, 250 insertions(+), 216 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 73899d0f9d8..f82f8db0227 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1387,8 +1387,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
-	WARN_ON(cow && root == root->fs_info->extent_root &&
-		!mutex_is_locked(&root->fs_info->alloc_mutex));
+
 	if (ins_len < 0)
 		lowest_unlock = 2;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index caa860a1c3e..fdba4f1b634 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -558,6 +558,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
+	struct mutex alloc_mutex;
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
@@ -635,7 +636,8 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex alloc_mutex;
+	struct mutex extent_ins_mutex;
+	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
@@ -1941,8 +1943,12 @@ int btrfs_acl_chmod(struct inode *inode);
 /* free-space-cache.c */
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 bytenr, u64 size);
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+			      u64 offset, u64 bytes);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 bytenr, u64 size);
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+				 u64 offset, u64 bytes);
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
 				   *block_group);
 struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 796256440df..d1137d7ea8d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1460,7 +1460,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->alloc_mutex);
+	mutex_init(&fs_info->extent_ins_mutex);
+	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e3b3e13a481..564260872c7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -53,24 +53,6 @@ __btrfs_find_block_group(struct btrfs_root *root,
 			 struct btrfs_block_group_cache *hint,
 			 u64 search_start, int data, int owner);
 
-void maybe_lock_mutex(struct btrfs_root *root)
-{
-	if (root != root->fs_info->extent_root &&
-	    root != root->fs_info->chunk_root &&
-	    root != root->fs_info->dev_root) {
-		mutex_lock(&root->fs_info->alloc_mutex);
-	}
-}
-
-void maybe_unlock_mutex(struct btrfs_root *root)
-{
-	if (root != root->fs_info->extent_root &&
-	    root != root->fs_info->chunk_root &&
-	    root != root->fs_info->dev_root) {
-		mutex_unlock(&root->fs_info->alloc_mutex);
-	}
-}
-
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -164,6 +146,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;
 
+	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -175,7 +158,8 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 			start = extent_end + 1;
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
-			ret = btrfs_add_free_space(block_group, start, size);
+			ret = btrfs_add_free_space_lock(block_group, start,
+							size);
 			BUG_ON(ret);
 			start = extent_end + 1;
 		} else {
@@ -185,9 +169,10 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (start < end) {
 		size = end - start;
-		ret = btrfs_add_free_space(block_group, start, size);
+		ret = btrfs_add_free_space_lock(block_group, start, size);
 		BUG_ON(ret);
 	}
+	mutex_unlock(&info->pinned_mutex);
 
 	return 0;
 }
@@ -445,13 +430,11 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	maybe_lock_mutex(root);
 	key.objectid = start;
 	key.offset = len;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
-	maybe_unlock_mutex(root);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -676,8 +659,9 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 
 		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
 		num_bytes = btrfs_level_size(root, (int)owner_objectid);
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_LOCKED, 0)) {
+				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
 			u64 priv;
 			ret = get_state_private(&root->fs_info->extent_ins,
 						bytenr, &priv);
@@ -686,6 +670,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 							(unsigned long)priv;
 			BUG_ON(extent_op->parent != orig_parent);
 			BUG_ON(extent_op->generation != orig_generation);
+
 			extent_op->parent = parent;
 			extent_op->generation = ref_generation;
 		} else {
@@ -703,10 +688,11 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 
 			set_extent_bits(&root->fs_info->extent_ins,
 					bytenr, bytenr + num_bytes - 1,
-					EXTENT_LOCKED, GFP_NOFS);
+					EXTENT_WRITEBACK, GFP_NOFS);
 			set_state_private(&root->fs_info->extent_ins,
 					  bytenr, (unsigned long)extent_op);
 		}
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		return 0;
 	}
 
@@ -742,12 +728,10 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
 					parent, ref_root, ref_root,
 					ref_generation, ref_generation,
 					owner_objectid);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -817,11 +801,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
 				     0, ref_root, 0, ref_generation,
 				     owner_objectid);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -886,7 +868,6 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
-	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -953,7 +934,6 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	}
 	ret = 0;
 out:
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1179,13 +1159,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			nr_file_extents++;
 
-			maybe_lock_mutex(root);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   key.objectid);
-			maybe_unlock_mutex(root);
 
 			if (ret) {
 				faili = i;
@@ -1194,13 +1172,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			maybe_lock_mutex(root);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   level - 1);
-			maybe_unlock_mutex(root);
 			if (ret) {
 				faili = i;
 				WARN_ON(1);
@@ -1270,24 +1246,20 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (bytenr == 0)
 				continue;
-			maybe_lock_mutex(root);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    key.objectid);
-			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, slot);
-			maybe_lock_mutex(root);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    level - 1);
-			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
 		}
@@ -1344,7 +1316,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		cache = NULL;
 		spin_lock(&root->fs_info->block_group_cache_lock);
@@ -1378,7 +1349,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		}
 	}
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return werr;
 }
 
@@ -1390,9 +1360,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	found = __find_space_info(info, flags);
 	if (found) {
+		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
 		found->full = 0;
+		spin_unlock(&found->lock);
 		*space_info = found;
 		return 0;
 	}
@@ -1479,43 +1451,53 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(!space_info);
 
+	spin_lock(&space_info->lock);
 	if (space_info->force_alloc) {
 		force = 1;
 		space_info->force_alloc = 0;
 	}
-	if (space_info->full)
+	if (space_info->full) {
+		spin_unlock(&space_info->lock);
 		goto out;
+	}
 
 	thresh = div_factor(space_info->total_bytes, 6);
 	if (!force &&
 	   (space_info->bytes_used + space_info->bytes_pinned +
-	    space_info->bytes_reserved + alloc_bytes) < thresh)
+	    space_info->bytes_reserved + alloc_bytes) < thresh) {
+		spin_unlock(&space_info->lock);
 		goto out;
+	}
 
-	while (!mutex_trylock(&extent_root->fs_info->chunk_mutex)) {
-		if (!force)
-			goto out;
-		mutex_unlock(&extent_root->fs_info->alloc_mutex);
-		cond_resched();
-		mutex_lock(&extent_root->fs_info->alloc_mutex);
+	spin_unlock(&space_info->lock);
+
+	ret = mutex_trylock(&extent_root->fs_info->chunk_mutex);
+	if (!ret && !force) {
+		goto out;
+	} else if (!ret) {
+		mutex_lock(&extent_root->fs_info->chunk_mutex);
 		waited = 1;
 	}
 
-	if (waited && space_info->full)
-		goto out_unlock;
+	if (waited) {
+		spin_lock(&space_info->lock);
+		if (space_info->full) {
+			spin_unlock(&space_info->lock);
+			goto out_unlock;
+		}
+		spin_unlock(&space_info->lock);
+	}
 
 	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
-	if (ret == -ENOSPC) {
+	if (ret) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
 		goto out_unlock;
 	}
-	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-
 out_unlock:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 out:
@@ -1533,7 +1515,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while(total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache) {
@@ -1542,6 +1523,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
+		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
 		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
@@ -1551,11 +1533,13 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			cache->space_info->bytes_used += num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
 			cache->space_info->bytes_used -= num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			if (mark_free) {
 				int ret;
 				ret = btrfs_add_free_space(cache, bytenr,
@@ -1588,7 +1572,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
+	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
@@ -1602,16 +1586,20 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		len = min(num, cache->key.offset -
 			  (bytenr - cache->key.objectid));
 		if (pin) {
+			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
 			cache->pinned += len;
 			cache->space_info->bytes_pinned += len;
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned += len;
 		} else {
+			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
 			cache->pinned -= len;
 			cache->space_info->bytes_pinned -= len;
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1627,23 +1615,23 @@ static int update_reserved_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
 		BUG_ON(!cache);
 		len = min(num, cache->key.offset -
 			  (bytenr - cache->key.objectid));
+
+		spin_lock(&cache->space_info->lock);
+		spin_lock(&cache->lock);
 		if (reserve) {
-			spin_lock(&cache->lock);
 			cache->reserved += len;
 			cache->space_info->bytes_reserved += len;
-			spin_unlock(&cache->lock);
 		} else {
-			spin_lock(&cache->lock);
 			cache->reserved -= len;
 			cache->space_info->bytes_reserved -= len;
-			spin_unlock(&cache->lock);
 		}
+		spin_unlock(&cache->lock);
+		spin_unlock(&cache->space_info->lock);
 		bytenr += len;
 		num -= len;
 	}
@@ -1658,6 +1646,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
+	mutex_lock(&root->fs_info->pinned_mutex);
 	while(1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
@@ -1666,6 +1655,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
+	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -1678,7 +1668,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_block_group_cache *cache;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->pinned_mutex);
 	while(1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -1690,12 +1680,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		if (cache->cached)
 			btrfs_add_free_space(cache, start, end - start + 1);
 		if (need_resched()) {
-			mutex_unlock(&root->fs_info->alloc_mutex);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
+			mutex_lock(&root->fs_info->pinned_mutex);
 		}
 	}
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -1705,6 +1695,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	u64 priv;
+	u64 search = 0;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_path *path;
 	struct btrfs_extent_ref *ref;
@@ -1714,20 +1705,37 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	int ret;
 	int err = 0;
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 	path = btrfs_alloc_path();
 
 	while(1) {
-		ret = find_first_extent_bit(&info->extent_ins, 0, &start,
-					    &end, EXTENT_LOCKED);
-		if (ret)
+		mutex_lock(&info->extent_ins_mutex);
+		ret = find_first_extent_bit(&info->extent_ins, search, &start,
+					    &end, EXTENT_WRITEBACK);
+		if (ret) {
+			mutex_unlock(&info->extent_ins_mutex);
+			if (search) {
+				search = 0;
+				continue;
+			}
 			break;
+		}
+
+		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
+		if (!ret) {
+			search = end+1;
+			mutex_unlock(&info->extent_ins_mutex);
+			cond_resched();
+			continue;
+		}
+		BUG_ON(ret < 0);
 
 		ret = get_state_private(&info->extent_ins, start, &priv);
 		BUG_ON(ret);
 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
 
+		mutex_unlock(&info->extent_ins_mutex);
+
 		if (extent_op->type == PENDING_EXTENT_INSERT) {
 			key.objectid = start;
 			key.offset = end + 1 - start;
@@ -1736,8 +1744,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 					&extent_item, sizeof(extent_item));
 			BUG_ON(err);
 
+			mutex_lock(&info->extent_ins_mutex);
 			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+					  EXTENT_WRITEBACK, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			err = insert_extent_backref(trans, extent_root, path,
 						start, extent_op->parent,
@@ -1753,8 +1763,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 						extent_op->level, 0);
 			BUG_ON(err);
 
+			mutex_lock(&info->extent_ins_mutex);
 			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+					  EXTENT_WRITEBACK, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			key.objectid = start;
 			key.offset = extent_op->parent;
@@ -1772,12 +1784,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 			BUG_ON(1);
 		}
 		kfree(extent_op);
+		unlock_extent(&info->extent_ins, start, end, GFP_NOFS);
+		search = 0;
 
-		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
+		cond_resched();
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -1790,7 +1800,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	int err = 0;
 	struct extent_buffer *buf;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (is_data)
 		goto pinit;
 
@@ -1847,7 +1856,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -1935,8 +1943,10 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 #endif
 
 		if (pin) {
+			mutex_lock(&root->fs_info->pinned_mutex);
 			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
 				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
@@ -1956,6 +1966,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
+		btrfs_release_path(extent_root, path);
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free);
 		BUG_ON(ret);
@@ -1994,70 +2005,91 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 {
 	int ret;
 	int err = 0;
-	int mark_free = 0;
 	u64 start;
 	u64 end;
 	u64 priv;
+	u64 search = 0;
 	struct extent_io_tree *pending_del;
 	struct extent_io_tree *extent_ins;
 	struct pending_extent_op *extent_op;
+	struct btrfs_fs_info *info = extent_root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	extent_ins = &extent_root->fs_info->extent_ins;
 	pending_del = &extent_root->fs_info->pending_del;
 
 	while(1) {
-		ret = find_first_extent_bit(pending_del, 0, &start, &end,
-					    EXTENT_LOCKED);
-		if (ret)
+		mutex_lock(&info->extent_ins_mutex);
+		ret = find_first_extent_bit(pending_del, search, &start, &end,
+					    EXTENT_WRITEBACK);
+		if (ret) {
+			mutex_unlock(&info->extent_ins_mutex);
+			if (search) {
+				search = 0;
+				continue;
+			}
 			break;
+		}
+
+		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
+		if (!ret) {
+			search = end+1;
+			mutex_unlock(&info->extent_ins_mutex);
+			cond_resched();
+			continue;
+		}
+		BUG_ON(ret < 0);
 
 		ret = get_state_private(pending_del, start, &priv);
 		BUG_ON(ret);
 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
 
-		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
+		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
 				  GFP_NOFS);
-
-		ret = pin_down_bytes(trans, extent_root, start,
-				     end + 1 - start, 0);
-		mark_free = ret > 0;
 		if (!test_range_bit(extent_ins, start, end,
-				    EXTENT_LOCKED, 0)) {
+				    EXTENT_WRITEBACK, 0)) {
+			mutex_unlock(&info->extent_ins_mutex);
 free_extent:
 			ret = __free_extent(trans, extent_root,
 					    start, end + 1 - start,
 					    extent_op->orig_parent,
 					    extent_root->root_key.objectid,
 					    extent_op->orig_generation,
-					    extent_op->level, 0, mark_free);
+					    extent_op->level, 1, 0);
 			kfree(extent_op);
 		} else {
 			kfree(extent_op);
-			ret = get_state_private(extent_ins, start, &priv);
+
+			ret = get_state_private(&info->extent_ins, start,
+						&priv);
 			BUG_ON(ret);
 			extent_op = (struct pending_extent_op *)
-							(unsigned long)priv;
+						(unsigned long)priv;
+
+			clear_extent_bits(&info->extent_ins, start, end,
+					  EXTENT_WRITEBACK, GFP_NOFS);
 
-			clear_extent_bits(extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			if (extent_op->type == PENDING_BACKREF_UPDATE)
 				goto free_extent;
 
+			mutex_lock(&extent_root->fs_info->pinned_mutex);
+			ret = pin_down_bytes(trans, extent_root, start,
+					     end + 1 - start, 0);
+			mutex_unlock(&extent_root->fs_info->pinned_mutex);
+
 			ret = update_block_group(trans, extent_root, start,
-						end + 1 - start, 0, mark_free);
+						end + 1 - start, 0, ret > 0);
+
 			BUG_ON(ret);
 			kfree(extent_op);
 		}
 		if (ret)
 			err = ret;
+		unlock_extent(extent_ins, start, end, GFP_NOFS);
 
-		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
+		search = 0;
+		cond_resched();
 	}
 	return err;
 }
@@ -2091,11 +2123,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		extent_op->orig_generation = ref_generation;
 		extent_op->level = (int)owner_objectid;
 
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+				EXTENT_WRITEBACK, GFP_NOFS);
 		set_state_private(&root->fs_info->pending_del,
 				  bytenr, (unsigned long)extent_op);
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		return 0;
 	}
 	/* if metadata always pin */
@@ -2134,11 +2168,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	maybe_lock_mutex(root);
 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
 				  root_objectid, ref_generation,
 				  owner_objectid, pin);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2214,12 +2246,16 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		 * group thats not of the proper type, while looping this
 		 * should never happen
 		 */
+		WARN_ON(!block_group);
+		mutex_lock(&block_group->alloc_mutex);
 		if (unlikely(!block_group_bits(block_group, data)))
 			goto new_group;
 
 		ret = cache_block_group(root, block_group);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&block_group->alloc_mutex);
 			break;
+		}
 
 		if (block_group->ro)
 			goto new_group;
@@ -2250,8 +2286,10 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				 * then we just re-search this block group
 				 */
 				if (search_start >= start &&
-				    search_start < end)
+				    search_start < end) {
+					mutex_unlock(&block_group->alloc_mutex);
 					continue;
+				}
 
 				/* else we go to the next block group */
 				goto new_group;
@@ -2259,10 +2297,15 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 			ins->objectid = search_start;
 			ins->offset = num_bytes;
+
+			btrfs_remove_free_space_lock(block_group, search_start,
+						     num_bytes);
 			/* we are all good, lets return */
+			mutex_unlock(&block_group->alloc_mutex);
 			break;
 		}
 new_group:
+		mutex_unlock(&block_group->alloc_mutex);
 		/*
 		 * Here's how this works.
 		 * loop == 0: we were searching a block group via a hint
@@ -2363,7 +2406,6 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	u64 search_start = 0;
 	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_block_group_cache *cache;
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -2419,13 +2461,6 @@ again:
 		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
-	cache = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid);
-		return -ENOSPC;
-	}
-
-	ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset);
 
 	return ret;
 }
@@ -2434,16 +2469,13 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	struct btrfs_block_group_cache *cache;
 
-	maybe_lock_mutex(root);
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
 		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
-		maybe_unlock_mutex(root);
 		return -ENOSPC;
 	}
 	btrfs_add_free_space(cache, start, len);
 	update_reserved_extents(root, start, len, 0);
-	maybe_unlock_mutex(root);
 	return 0;
 }
 
@@ -2455,12 +2487,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 data)
 {
 	int ret;
-	maybe_lock_mutex(root);
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
 				     empty_size, hint_byte, search_end, ins,
 				     data);
 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2510,11 +2540,13 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 		extent_op->orig_generation = 0;
 		extent_op->level = (int)owner;
 
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+				EXTENT_WRITEBACK, GFP_NOFS);
 		set_state_private(&root->fs_info->extent_ins,
 				  ins->objectid, (unsigned long)extent_op);
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		goto update_block;
 	}
 
@@ -2578,11 +2610,9 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2599,15 +2629,16 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_block_group_cache *block_group;
 
-	maybe_lock_mutex(root);
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	mutex_lock(&block_group->alloc_mutex);
 	cache_block_group(root, block_group);
 
-	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
+	ret = btrfs_remove_free_space_lock(block_group, ins->objectid,
+					   ins->offset);
+	mutex_unlock(&block_group->alloc_mutex);
 	BUG_ON(ret);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2627,8 +2658,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	maybe_lock_mutex(root);
-
 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
@@ -2642,7 +2671,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	} else {
 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	}
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2734,12 +2762,10 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		if (disk_bytenr == 0)
 			continue;
 
-		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, 0);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 
 		atomic_inc(&root->fs_info->throttle_gen);
@@ -2758,12 +2784,10 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_info *info = ref->extents;
 
 	for (i = 0; i < ref->nritems; i++) {
-		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
 					  info->objectid, 0);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
@@ -2875,13 +2899,11 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
 
-			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
 						root_owner, root_gen,
 						*level - 1, 1);
 			BUG_ON(ret);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 
 			atomic_inc(&root->fs_info->throttle_gen);
 			wake_up(&root->fs_info->transaction_throttle);
@@ -2957,11 +2979,9 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -3440,8 +3460,6 @@ static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
-
 	if (first_time) {
 		ref_path->lowest_level = -1;
 		ref_path->current_level = -1;
@@ -3498,9 +3516,7 @@ next:
 		level--;
 		btrfs_release_path(extent_root, path);
 		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
 		}
 	}
 	/* reached lowest level */
@@ -3613,15 +3629,12 @@ found:
 
 		btrfs_release_path(extent_root, path);
 		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
 		}
 	}
 	/* reached max tree level, but no tree root found. */
 	BUG();
 out:
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -4556,14 +4569,6 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 					struct btrfs_ref_path *ref_path)
 {
 	int ret;
-	int needs_lock = 0;
-
-	if (root == root->fs_info->extent_root ||
-	    root == root->fs_info->chunk_root ||
-	    root == root->fs_info->dev_root) {
-		needs_lock = 1;
-		mutex_lock(&root->fs_info->alloc_mutex);
-	}
 
 	ret = relocate_one_path(trans, root, path, first_key,
 				ref_path, NULL, NULL);
@@ -4571,8 +4576,6 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 
 	if (root == root->fs_info->extent_root)
 		btrfs_extent_post_op(trans, root);
-	if (needs_lock)
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 	return 0;
 }
@@ -4584,14 +4587,12 @@ static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
 	if (ret)
 		goto out;
 	ret = btrfs_del_item(trans, extent_root, path);
 out:
 	btrfs_release_path(extent_root, path);
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -4627,7 +4628,6 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	struct btrfs_key first_key;
 	u64 prev_block = 0;
 
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 
 	trans = btrfs_start_transaction(extent_root, 1);
 	BUG_ON(!trans);
@@ -4754,7 +4754,6 @@ out:
 	btrfs_end_transaction(trans, extent_root);
 	kfree(new_extents);
 	kfree(ref_path);
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -4807,10 +4806,8 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	spin_lock(&shrink_block_group->lock);
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
 		spin_unlock(&shrink_block_group->lock);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		trans = btrfs_start_transaction(root, 1);
-		mutex_lock(&root->fs_info->alloc_mutex);
 		spin_lock(&shrink_block_group->lock);
 
 		new_alloc_flags = update_block_group_flags(root,
@@ -4826,9 +4823,7 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
 
-		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_end_transaction(trans, root);
-		mutex_lock(&root->fs_info->alloc_mutex);
 	} else
 		spin_unlock(&shrink_block_group->lock);
 	return 0;
@@ -4952,14 +4947,10 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	reloc_inode = create_reloc_inode(info, block_group);
 	BUG_ON(IS_ERR(reloc_inode));
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	__alloc_chunk_for_shrink(root, block_group, 1);
 	block_group->ro = 1;
 	block_group->space_info->total_bytes -= block_group->key.offset;
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
-
 	btrfs_start_delalloc_inodes(info->tree_root);
 	btrfs_wait_ordered_extents(info->tree_root, 0);
 again:
@@ -4978,8 +4969,6 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	while(1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -5007,9 +4996,7 @@ next:
 
 		if (progress && need_resched()) {
 			btrfs_release_path(root, path);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
 			progress = 0;
 			continue;
 		}
@@ -5036,7 +5023,6 @@ next:
 	}
 
 	btrfs_release_path(root, path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 
 	if (pass == 0) {
 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
@@ -5058,8 +5044,6 @@ next:
 	trans = btrfs_start_transaction(info->tree_root, 1);
 	btrfs_commit_transaction(trans, info->tree_root);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	spin_lock(&block_group->lock);
 	WARN_ON(block_group->pinned > 0);
 	WARN_ON(block_group->reserved > 0);
@@ -5067,7 +5051,6 @@ next:
 	spin_unlock(&block_group->lock);
 	ret = 0;
 out:
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -5114,7 +5097,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	struct btrfs_block_group_cache *block_group;
 	struct rb_node *n;
 
-	mutex_lock(&info->alloc_mutex);
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -5132,7 +5114,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		kfree(block_group);
 	}
 	spin_unlock(&info->block_group_cache_lock);
-	mutex_unlock(&info->alloc_mutex);
 	return 0;
 }
 
@@ -5155,7 +5136,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
@@ -5174,6 +5154,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		}
 
 		spin_lock_init(&cache->lock);
+		mutex_init(&cache->alloc_mutex);
 		INIT_LIST_HEAD(&cache->list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -5201,7 +5182,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -5214,7 +5194,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_root *extent_root;
 	struct btrfs_block_group_cache *cache;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	extent_root = root->fs_info->extent_root;
 
 	root->fs_info->last_trans_new_blockgroup = trans->transid;
@@ -5226,6 +5205,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	spin_lock_init(&cache->lock);
+	mutex_init(&cache->alloc_mutex);
 	INIT_LIST_HEAD(&cache->list);
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
@@ -5264,7 +5244,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret;
 
-	BUG_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	root = root->fs_info->extent_root;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 314041fdfa4..7503bd46819 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -938,6 +938,32 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 }
 EXPORT_SYMBOL(lock_extent);
 
+/*
+ * Try to take EXTENT_LOCKED on [start, end] without waiting.
+ *
+ * Returns 1 when the whole range was locked and 0 when some part of it was
+ * already locked by someone else.  In the latter case nothing stays locked:
+ * set_extent_bit() may have set the bit on the front of the range before it
+ * hit the conflict at failed_start, so that part is released again here.
+ */
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		    gfp_t mask)
+{
+	int err;
+	u64 failed_start;
+
+	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+			     &failed_start, mask);
+	if (err == -EEXIST) {
+		/* drop the partial lock on [start, failed_start) */
+		if (failed_start > start)
+			unlock_extent(tree, start, failed_start - 1, mask);
+		return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(try_lock_extent);
+
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		  gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 86f859b87a6..283110ec4ee 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -128,6 +128,8 @@ int try_release_extent_state(struct extent_map_tree *map,
 			     gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent);
 int __init extent_io_init(void);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 96241f01fa0..f4926c0f3c8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -184,8 +184,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 	return ret;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 offset, u64 bytes)
+static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+				  u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *right_info;
 	struct btrfs_free_space *left_info;
@@ -202,8 +202,6 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	 * are adding, if there is remove that struct and add a new one to
 	 * cover the entire range
 	 */
-	spin_lock(&block_group->lock);
-
 	right_info = tree_search_offset(&block_group->free_space_offset,
 					offset+bytes, 0, 1);
 	left_info = tree_search_offset(&block_group->free_space_offset,
@@ -261,7 +259,6 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	if (ret)
 		kfree(info);
 out:
-	spin_unlock(&block_group->lock);
 	if (ret) {
 		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
 		if (ret == -EEXIST)
@@ -274,13 +271,13 @@ out:
 	return ret;
 }
 
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 offset, u64 bytes)
+static int
+__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			  u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *info;
 	int ret = 0;
 
-	spin_lock(&block_group->lock);
 	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
 				  1);
 
@@ -334,17 +331,63 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 		/* step two, insert a new info struct to cover anything
 		 * before the hole
 		 */
-		spin_unlock(&block_group->lock);
-		ret = btrfs_add_free_space(block_group, old_start,
-					   offset - old_start);
+		ret = __btrfs_add_free_space(block_group, old_start,
+					     offset - old_start);
 		BUG_ON(ret);
-		goto out_nolock;
 	} else {
 		WARN_ON(1);
 	}
 out:
-	spin_unlock(&block_group->lock);
-out_nolock:
+	return ret;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
+{
+	int ret;
+	struct btrfs_free_space *sp;
+
+	mutex_lock(&block_group->alloc_mutex);
+	ret = __btrfs_add_free_space(block_group, offset, bytes);
+	sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+	BUG_ON(!sp);
+	mutex_unlock(&block_group->alloc_mutex);
+
+	return ret;
+}
+
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+			      u64 offset, u64 bytes)
+{
+	int ret;
+	struct btrfs_free_space *sp;
+
+	ret = __btrfs_add_free_space(block_group, offset, bytes);
+	sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+	BUG_ON(!sp);
+
+	return ret;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
+{
+	int ret = 0;
+
+	mutex_lock(&block_group->alloc_mutex);
+	ret = __btrfs_remove_free_space(block_group, offset, bytes);
+	mutex_unlock(&block_group->alloc_mutex);
+
+	return ret;
+}
+
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+				 u64 offset, u64 bytes)
+{
+	int ret;
+
+	ret = __btrfs_remove_free_space(block_group, offset, bytes);
+
 	return ret;
 }
 
@@ -386,18 +429,18 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 	struct btrfs_free_space *info;
 	struct rb_node *node;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
 		info = rb_entry(node, struct btrfs_free_space, bytes_index);
 		unlink_free_space(block_group, info);
 		kfree(info);
 		if (need_resched()) {
-			spin_unlock(&block_group->lock);
+			mutex_unlock(&block_group->alloc_mutex);
 			cond_resched();
-			spin_lock(&block_group->lock);
+			mutex_lock(&block_group->alloc_mutex);
 		}
 	}
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 }
 
 struct btrfs_free_space *btrfs_find_free_space_offset(struct
@@ -407,10 +450,10 @@ struct btrfs_free_space *btrfs_find_free_space_offset(struct
 {
 	struct btrfs_free_space *ret;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 	ret = tree_search_offset(&block_group->free_space_offset, offset,
 				 bytes, 0);
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 
 	return ret;
 }
@@ -422,10 +465,10 @@ struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 {
 	struct btrfs_free_space *ret;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 
 	ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 
 	return ret;
 }
@@ -434,16 +477,13 @@ struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
 					       *block_group, u64 offset,
 					       u64 bytes)
 {
-	struct btrfs_free_space *ret;
+	struct btrfs_free_space *ret = NULL;
 
-	spin_lock(&block_group->lock);
 	ret = tree_search_offset(&block_group->free_space_offset, offset,
 				 bytes, 0);
 	if (!ret)
 		ret = tree_search_bytes(&block_group->free_space_bytes,
 					offset, bytes);
 
-	spin_unlock(&block_group->lock);
-
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1df67129cc3..48b455fdaac 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -670,7 +670,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		atomic_dec(&root->fs_info->throttles);
 		wake_up(&root->fs_info->transaction_throttle);
 
-		mutex_lock(&root->fs_info->alloc_mutex);
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
@@ -678,7 +677,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret) {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 6f57d0889b1..a6a3956cedf 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -125,9 +125,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (is_extent)
 		btrfs_extent_post_op(trans, root);
 out:
-	if (is_extent)
-		mutex_unlock(&root->fs_info->alloc_mutex);
-
 	if (path)
 		btrfs_free_path(path);
 	if (ret == -EAGAIN) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e6d579053a4..835daed5561 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -271,10 +271,10 @@ static int process_one_buffer(struct btrfs_root *log,
 			      struct walk_control *wc, u64 gen)
 {
 	if (wc->pin) {
-		mutex_lock(&log->fs_info->alloc_mutex);
+		mutex_lock(&log->fs_info->pinned_mutex);
 		btrfs_update_pinned_extents(log->fs_info->extent_root,
 					    eb->start, eb->len, 1);
-		mutex_unlock(&log->fs_info->alloc_mutex);
+		mutex_unlock(&log->fs_info->pinned_mutex);
 	}
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7db4cfd03a9..cbb9bb31431 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -58,14 +58,12 @@ void btrfs_unlock_volumes(void)
 
 static void lock_chunks(struct btrfs_root *root)
 {
-	mutex_lock(&root->fs_info->alloc_mutex);
 	mutex_lock(&root->fs_info->chunk_mutex);
 }
 
 static void unlock_chunks(struct btrfs_root *root)
 {
 	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 }
 
 int btrfs_cleanup_fs_uuids(void)
-- 
cgit v1.2.3-70-g09d2


From 84234f3a1f7c532e4afeba03cc8e7e4a8a5277ea Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: Add root tree pointer transaction ids

This patch adds transaction IDs to root tree pointers.
Transaction IDs in tree pointers are compared with the
generation numbers in block headers when reading root
blocks of trees. This can detect some types of IO errors.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h       |  8 ++++++++
 fs/btrfs/disk-io.c     | 21 ++++++++++++++-------
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/ioctl.c       |  1 +
 fs/btrfs/transaction.c | 13 +++++++++++++
 fs/btrfs/tree-log.c    |  2 ++
 6 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fdba4f1b634..0621ab90b1a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -297,6 +297,7 @@ struct btrfs_super_block {
 	__le32 leafsize;
 	__le32 stripesize;
 	__le32 sys_chunk_array_size;
+	__le64 chunk_root_generation;
 	u8 root_level;
 	u8 chunk_root_level;
 	u8 log_root_level;
@@ -448,6 +449,7 @@ struct btrfs_dir_item {
 
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
+	__le64 generation;
 	__le64 root_dirid;
 	__le64 bytenr;
 	__le64 byte_limit;
@@ -1396,10 +1398,14 @@ static inline int btrfs_is_leaf(struct extent_buffer *eb)
 }
 
 /* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+		   generation, 64);
 BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
 BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
 BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
 
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
 BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
@@ -1416,6 +1422,8 @@ BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
 BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
 			 struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+			 struct btrfs_super_block, chunk_root_generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
 			 root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d1137d7ea8d..94b4e50f6b2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -832,6 +832,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 {
 	int ret;
 	u32 blocksize;
+	u64 generation;
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
@@ -840,9 +841,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
 
+	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, 0);
+				     blocksize, generation);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -929,6 +931,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	struct btrfs_path *path;
 	struct extent_buffer *l;
 	u64 highest_inode;
+	u64 generation;
 	u32 blocksize;
 	int ret = 0;
 
@@ -970,9 +973,10 @@ out:
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, 0);
+				     blocksize, generation);
 	BUG_ON(!root->node);
 insert:
 	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1357,6 +1361,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 leafsize;
 	u32 blocksize;
 	u32 stripesize;
+	u64 generation;
 	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
@@ -1596,13 +1601,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_chunk_root_level(disk_super));
+	generation = btrfs_super_chunk_root_generation(disk_super);
 
 	__setup_root(nodesize, leafsize, sectorsize, stripesize,
 		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
-					   blocksize, 0);
+					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
@@ -1618,11 +1624,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
-
+	generation = btrfs_super_generation(disk_super);
 
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
-					  blocksize, 0);
+					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_sb_buffer;
 
@@ -1672,15 +1678,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
-						      blocksize, 0);
+						      blocksize,
+						      generation + 1);
 		ret = btrfs_recover_log_trees(log_tree_root);
 		BUG_ON(ret);
 	}
+	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 
 	ret = btrfs_cleanup_reloc_trees(tree_root);
 	BUG_ON(ret);
 
-	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 	return tree_root;
 
 fail_cleaner:
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 564260872c7..155c8dc56a2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4428,6 +4428,7 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_root_refs(root_item, 0);
 	btrfs_set_root_bytenr(root_item, eb->start);
 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	btrfs_set_root_generation(root_item, trans->transid);
 
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1136ce2febc..fd3c8b5676c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -108,6 +108,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_generation(&root_item, trans->transid);
 	btrfs_set_root_level(&root_item, 0);
 	btrfs_set_root_refs(&root_item, 1);
 	btrfs_set_root_used(&root_item, 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 48b455fdaac..924af6f2aea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -439,6 +439,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 				       root->node->start);
 		btrfs_set_root_level(&root->root_item,
 				     btrfs_header_level(root->node));
+		btrfs_set_root_generation(&root->root_item, trans->transid);
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
@@ -456,6 +457,12 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *next;
+	struct extent_buffer *eb;
+
+	eb = btrfs_lock_root_node(fs_info->tree_root);
+	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
 
 	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
@@ -559,6 +566,9 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 					      root->node->start);
 			btrfs_set_root_level(&root->root_item,
 					     btrfs_header_level(root->node));
+			btrfs_set_root_generation(&root->root_item,
+						  root->root_key.offset);
+
 			err = btrfs_insert_root(trans, root->fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
@@ -756,6 +766,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	btrfs_set_root_bytenr(new_root_item, tmp->start);
 	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
+	btrfs_set_root_generation(new_root_item, trans->transid);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				new_root_item);
 	btrfs_tree_unlock(tmp);
@@ -946,6 +957,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				   chunk_root->node->start);
 	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
 					 btrfs_header_level(chunk_root->node));
+	btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
+				btrfs_header_generation(chunk_root->node));
 
 	if (!root->fs_info->log_root_recovering) {
 		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 835daed5561..e0201c3a7dc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -117,6 +117,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_generation(&root_item, trans->transid);
 	btrfs_set_root_level(&root_item, 0);
 	btrfs_set_root_refs(&root_item, 0);
 	btrfs_set_root_used(&root_item, 0);
@@ -2065,6 +2066,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
 		return 0;
 
 	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_generation(&log->root_item, trans->transid);
 	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
 	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
 				&log->root_key, &log->root_item);
-- 
cgit v1.2.3-70-g09d2


From d899e05215178fed903ad0e7fc1cb4d8e0cc0a88 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Oct 2008 14:25:28 -0400
Subject: Btrfs: Add fallocate support v2

This patch updates btrfs-progs for fallocate support.

fallocate is a little different in Btrfs because we need to tell the
COW system that a given preallocated extent doesn't need to be
cow'd as long as there are no snapshots of it.  This leverages the
-o nodatacow checks.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h        |   9 +-
 fs/btrfs/extent-tree.c  |  65 +++++-----
 fs/btrfs/extent_io.c    |  12 +-
 fs/btrfs/extent_map.h   |   1 +
 fs/btrfs/file.c         | 245 +++++++++++++++++++++++++++++++++++-
 fs/btrfs/inode.c        | 323 +++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ioctl.c        |   3 +-
 fs/btrfs/ordered-data.c |   5 +-
 fs/btrfs/ordered-data.h |   4 +-
 fs/btrfs/tree-log.c     |  13 +-
 10 files changed, 563 insertions(+), 117 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8bf6a085a73..d5ba3d1aaf9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -462,8 +462,9 @@ struct btrfs_root_item {
 	u8 level;
 } __attribute__ ((__packed__));
 
-#define BTRFS_FILE_EXTENT_REG 0
-#define BTRFS_FILE_EXTENT_INLINE 1
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
 
 struct btrfs_file_extent_item {
 	/*
@@ -868,6 +869,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
 #define BTRFS_INODE_NOCOMPRESS		(1 << 3)
+#define BTRFS_INODE_PREALLOC		(1 << 4)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1924,6 +1926,9 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
 
 /* tree-defrag.c */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1eb69a91b72..8af39521eb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2147,6 +2147,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	total_needed += empty_size;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
+	if (!block_group)
+		block_group = btrfs_lookup_first_block_group(root->fs_info,
+							     search_start);
 	space_info = __find_space_info(root->fs_info, data);
 
 	down_read(&space_info->groups_sem);
@@ -3426,9 +3429,7 @@ walk_down:
 next:
 		level--;
 		btrfs_release_path(extent_root, path);
-		if (need_resched()) {
-			cond_resched();
-		}
+		cond_resched();
 	}
 	/* reached lowest level */
 	ret = 1;
@@ -3539,9 +3540,7 @@ found:
 		}
 
 		btrfs_release_path(extent_root, path);
-		if (need_resched()) {
-			cond_resched();
-		}
+		cond_resched();
 	}
 	/* reached max tree level, but no tree root found. */
 	BUG();
@@ -3654,8 +3653,9 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
 		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
 									   fi);
-		WARN_ON(exts[nr].offset > 0);
-		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
+		BUG_ON(exts[nr].offset > 0);
+		BUG_ON(exts[nr].compression || exts[nr].encryption);
+		BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
 		cur_pos += exts[nr].num_bytes;
 		nr++;
@@ -3709,6 +3709,7 @@ static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
 	u32 nritems;
 	int nr_scaned = 0;
 	int extent_locked = 0;
+	int extent_type;
 	int ret;
 
 	memcpy(&key, leaf_key, sizeof(key));
@@ -3781,8 +3782,9 @@ next:
 		}
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
-		if ((btrfs_file_extent_type(leaf, fi) !=
-		     BTRFS_FILE_EXTENT_REG) ||
+		extent_type = btrfs_file_extent_type(leaf, fi);
+		if ((extent_type != BTRFS_FILE_EXTENT_REG &&
+		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
 		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
 		     extent_key->objectid)) {
 			path->slots[0]++;
@@ -3865,16 +3867,10 @@ next:
 
 		if (nr_extents == 1) {
 			/* update extent pointer in place */
-			btrfs_set_file_extent_generation(leaf, fi,
-						trans->transid);
 			btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extents[0].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[0].disk_num_bytes);
-			btrfs_set_file_extent_ram_bytes(leaf, fi,
-						new_extents[0].ram_bytes);
-			ext_offset += new_extents[0].offset;
-			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 			btrfs_mark_buffer_dirty(leaf);
 
 			btrfs_drop_extent_cache(inode, key.offset,
@@ -3901,6 +3897,8 @@ next:
 			btrfs_release_path(root, path);
 			key.offset += num_bytes;
 		} else {
+			BUG_ON(1);
+#if 0
 			u64 alloc_hint;
 			u64 extent_len;
 			int i;
@@ -3977,6 +3975,7 @@ next:
 					break;
 			}
 			BUG_ON(i >= nr_extents);
+#endif
 		}
 
 		if (extent_locked) {
@@ -4156,15 +4155,10 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
-		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-		btrfs_set_file_extent_ram_bytes(leaf, fi,
-						new_extent->ram_bytes);
 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extent->disk_bytenr);
 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extent->disk_num_bytes);
-		new_extent->offset += btrfs_file_extent_offset(leaf, fi);
-		btrfs_set_file_extent_offset(leaf, fi, new_extent->offset);
 		btrfs_mark_buffer_dirty(leaf);
 
 		ret = btrfs_inc_extent_ref(trans, root,
@@ -4625,12 +4619,15 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 			 */
 			if (!new_extents) {
 				u64 group_start = group->key.objectid;
+				new_extents = kmalloc(sizeof(*new_extents),
+						      GFP_NOFS);
+				nr_extents = 1;
 				ret = get_new_locations(reloc_inode,
 							extent_key,
-							group_start, 0,
+							group_start, 1,
 							&new_extents,
 							&nr_extents);
-				if (ret < 0)
+				if (ret)
 					goto out;
 			}
 			btrfs_record_root_in_trans(found_root);
@@ -4762,7 +4759,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_generation(leaf, item, 1);
 	btrfs_set_inode_size(leaf, item, size);
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM);
+	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM |
+					  BTRFS_INODE_NOCOMPRESS);
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(root, path);
 out:
@@ -4835,6 +4833,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	struct inode *reloc_inode;
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key key;
+	u64 skipped;
 	u64 cur_byte;
 	u64 total_found;
 	u32 nritems;
@@ -4864,6 +4863,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	btrfs_start_delalloc_inodes(info->tree_root);
 	btrfs_wait_ordered_extents(info->tree_root, 0);
 again:
+	skipped = 0;
 	total_found = 0;
 	progress = 0;
 	key.objectid = block_group->key.objectid;
@@ -4926,6 +4926,8 @@ next:
 		ret = relocate_one_extent(root, path, &key, block_group,
 					  reloc_inode, pass);
 		BUG_ON(ret < 0);
+		if (ret > 0)
+			skipped++;
 
 		key.objectid = cur_byte;
 		key.type = 0;
@@ -4944,6 +4946,11 @@ next:
 		printk("btrfs found %llu extents in pass %d\n",
 		       (unsigned long long)total_found, pass);
 		pass++;
+		if (total_found == skipped && pass > 2) {
+			iput(reloc_inode);
+			reloc_inode = create_reloc_inode(info, block_group);
+			pass = 0;
+		}
 		goto again;
 	}
 
@@ -5011,17 +5018,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
 				       cache_node);
-
-		spin_unlock(&info->block_group_cache_lock);
-		btrfs_remove_free_space_cache(block_group);
-		spin_lock(&info->block_group_cache_lock);
-
 		rb_erase(&block_group->cache_node,
 			 &info->block_group_cache_tree);
+		spin_unlock(&info->block_group_cache_lock);
+
+		btrfs_remove_free_space_cache(block_group);
 		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
 		up_write(&block_group->space_info->groups_sem);
 		kfree(block_group);
+
+		spin_lock(&info->block_group_cache_lock);
 	}
 	spin_unlock(&info->block_group_cache_lock);
 	return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 65a0583027e..eb3c12e7bea 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2015,6 +2015,8 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		bdev = em->bdev;
 		block_start = em->block_start;
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			block_start = EXTENT_MAP_HOLE;
 		free_extent_map(em);
 		em = NULL;
 
@@ -2769,14 +2771,18 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 	struct inode *inode = mapping->host;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
+	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
+	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+		    GFP_NOFS);
+	em = get_extent(inode, NULL, 0, start, blksize, 0);
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+		      GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
-	if (em->block_start == EXTENT_MAP_INLINE ||
-	    em->block_start == EXTENT_MAP_HOLE)
+	if (em->block_start > EXTENT_MAP_LAST_BYTE)
 		goto out;
 
 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e693e1b4ac4..accfedaeb51 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -12,6 +12,7 @@
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
 #define EXTENT_FLAG_COMPRESSED 1
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
+#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
 
 struct extent_map {
 	struct rb_node rb_node;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1a0510ad030..238a8e215eb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -381,7 +381,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int keep;
 	int slot;
 	int bookend;
-	int found_type;
+	int found_type = 0;
 	int found_extent;
 	int found_inline;
 	int recow;
@@ -442,7 +442,8 @@ next_slot:
 								  extent);
 			other_encoding = btrfs_file_extent_other_encoding(leaf,
 								  extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
+			if (found_type == BTRFS_FILE_EXTENT_REG ||
+			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 				extent_end =
 				     btrfs_file_extent_disk_bytenr(leaf,
 								   extent);
@@ -609,8 +610,7 @@ next_slot:
 			 */
 			btrfs_set_file_extent_ram_bytes(leaf, extent,
 							ram_bytes);
-			btrfs_set_file_extent_type(leaf, extent,
-						   BTRFS_FILE_EXTENT_REG);
+			btrfs_set_file_extent_type(leaf, extent, found_type);
 
 			btrfs_mark_buffer_dirty(path->nodes[0]);
 
@@ -661,6 +661,243 @@ out:
 	return ret;
 }
 
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+			    u64 objectid, u64 bytenr, u64 *start, u64 *end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 extent_end;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+	    btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		return 0;
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if ((*start && *start != key.offset) || (*end && *end != extent_end))
+		return 0;
+
+	*start = key.offset;
+	*end = extent_end;
+	return 1;
+}
+
+/*
+ * Mark the extent in the range [start, end) as written.
+ *
+ * This changes the extent type from 'pre-allocated' to 'regular'. If only
+ * part of the extent is marked as written, the extent is split into two
+ * or three pieces.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode, u64 start, u64 end)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 bytenr;
+	u64 num_bytes;
+	u64 extent_end;
+	u64 extent_offset;
+	u64 other_start;
+	u64 other_end;
+	u64 split = start;
+	u64 locked_end = end;
+	int extent_type;
+	int split_end = 1;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+again:
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (split == start)
+		key.offset = split;
+	else
+		key.offset = split - 1;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0 && path->slots[0] > 0)
+		path->slots[0]--;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	BUG_ON(key.objectid != inode->i_ino ||
+	       key.type != BTRFS_EXTENT_DATA_KEY);
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	extent_type = btrfs_file_extent_type(leaf, fi);
+	BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	BUG_ON(key.offset > start || extent_end < end);
+
+	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+	extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+	if (key.offset == start)
+		split = end;
+
+	if (key.offset == start && extent_end == end) {
+		int del_nr = 0;
+		int del_slot = 0;
+		u64 leaf_owner = btrfs_header_owner(leaf);
+		u64 leaf_gen = btrfs_header_generation(leaf);
+		other_start = end;
+		other_end = 0;
+		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			extent_end = other_end;
+			del_slot = path->slots[0] + 1;
+			del_nr++;
+			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+						leaf->start, leaf_owner,
+						leaf_gen, inode->i_ino, 0);
+			BUG_ON(ret);
+		}
+		other_start = 0;
+		other_end = start;
+		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			key.offset = other_start;
+			del_slot = path->slots[0];
+			del_nr++;
+			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+						leaf->start, leaf_owner,
+						leaf_gen, inode->i_ino, 0);
+			BUG_ON(ret);
+		}
+		split_end = 0;
+		if (del_nr == 0) {
+			btrfs_set_file_extent_type(leaf, fi,
+						   BTRFS_FILE_EXTENT_REG);
+			goto done;
+		}
+
+		fi = btrfs_item_ptr(leaf, del_slot - 1,
+				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						extent_end - key.offset);
+		btrfs_mark_buffer_dirty(leaf);
+
+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+		BUG_ON(ret);
+		goto done;
+	} else if (split == start) {
+		if (locked_end < extent_end) {
+			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+					locked_end, extent_end - 1, GFP_NOFS);
+			if (!ret) {
+				btrfs_release_path(root, path);
+				lock_extent(&BTRFS_I(inode)->io_tree,
+					locked_end, extent_end - 1, GFP_NOFS);
+				locked_end = extent_end;
+				goto again;
+			}
+			locked_end = extent_end;
+		}
+		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
+		extent_offset += split - key.offset;
+	} else  {
+		BUG_ON(key.offset != start);
+		btrfs_set_file_extent_offset(leaf, fi, extent_offset +
+					     split - key.offset);
+		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
+		key.offset = split;
+		btrfs_set_item_key_safe(trans, root, path, &key);
+		extent_end = split;
+	}
+
+	if (extent_end == end) {
+		split_end = 0;
+		extent_type = BTRFS_FILE_EXTENT_REG;
+	}
+	if (extent_end == end && split == start) {
+		other_start = end;
+		other_end = 0;
+		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			path->slots[0]++;
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			key.offset = split;
+			btrfs_set_item_key_safe(trans, root, path, &key);
+			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							other_end - split);
+			goto done;
+		}
+	}
+	if (extent_end == end && split == end) {
+		other_start = 0;
+		other_end = start;
+		if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			path->slots[0]--;
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
+							other_start);
+			goto done;
+		}
+	}
+
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	key.offset = start;
+	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+	btrfs_set_file_extent_type(leaf, fi, extent_type);
+	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
+	btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
+	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+	btrfs_set_file_extent_compression(leaf, fi, 0);
+	btrfs_set_file_extent_encryption(leaf, fi, 0);
+	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+
+	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+				   leaf->start, root->root_key.objectid,
+				   trans->transid, inode->i_ino);
+	BUG_ON(ret);
+done:
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+	if (split_end && split == start) {
+		split = end;
+		goto again;
+	}
+	if (locked_end > end) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+			      GFP_NOFS);
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
 /*
  * this gets pages into the page cache and locks them down, it also properly
  * waits for data=ordered extents to finish before allowing the pages to be
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e6f0568fdb..789c376157f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -37,6 +37,7 @@
 #include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
+#include <linux/falloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -587,7 +588,7 @@ free_pages_out:
  * blocks on disk
  */
 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
-			      u64 start, u64 end, int *page_started)
+			      u64 start, u64 end, int *page_started, int force)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -602,6 +603,7 @@ static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
 	u64 num_bytes;
 	int extent_type;
 	int ret;
+	int type;
 	int nocow;
 	int check_prev = 1;
 
@@ -654,7 +656,8 @@ next_slot:
 				    struct btrfs_file_extent_item);
 		extent_type = btrfs_file_extent_type(leaf, fi);
 
-		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			struct btrfs_block_group_cache *block_group;
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 			extent_end = found_key.offset +
@@ -669,6 +672,8 @@ next_slot:
 				goto out_check;
 			if (disk_bytenr == 0)
 				goto out_check;
+			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
+				goto out_check;
 			if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
 				goto out_check;
 			block_group = btrfs_lookup_block_group(root->fs_info,
@@ -709,10 +714,39 @@ out_check:
 
 		disk_bytenr += cur_offset - found_key.offset;
 		num_bytes = min(end + 1, extent_end) - cur_offset;
+		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			struct extent_map *em;
+			struct extent_map_tree *em_tree;
+			em_tree = &BTRFS_I(inode)->extent_tree;
+			em = alloc_extent_map(GFP_NOFS);
+			em->start = cur_offset;
+			em->len = num_bytes;
+			em->block_len = num_bytes;
+			em->block_start = disk_bytenr;
+			em->bdev = root->fs_info->fs_devices->latest_bdev;
+			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			while (1) {
+				spin_lock(&em_tree->lock);
+				ret = add_extent_mapping(em_tree, em);
+				spin_unlock(&em_tree->lock);
+				if (ret != -EEXIST) {
+					free_extent_map(em);
+					break;
+				}
+				btrfs_drop_extent_cache(inode, em->start,
+						em->start + em->len - 1, 0);
+			}
+			type = BTRFS_ORDERED_PREALLOC;
+		} else {
+			type = BTRFS_ORDERED_NOCOW;
+		}
 
 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
-					       num_bytes, num_bytes,
-					       BTRFS_ORDERED_NOCOW);
+					       num_bytes, num_bytes, type);
+		BUG_ON(ret);
+		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+					cur_offset, cur_offset + num_bytes - 1,
+					locked_page, 0, 0, 0);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -745,7 +779,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started);
+					 page_started, 0);
+	else if (btrfs_test_flag(inode, PREALLOC))
+		ret = run_delalloc_nocow(inode, locked_page, start, end,
+					 page_started, 1);
 	else
 		ret = cow_file_range(inode, locked_page, start, end,
 				     page_started);
@@ -1006,6 +1043,63 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	return -EAGAIN;
 }
 
+/* drop [file_pos, file_pos + num_bytes), insert one extent_type file extent
+ * item there and hand the disk extent to btrfs_alloc_reserved_extent()
+ */
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+			struct inode *inode, u64 file_pos, u64 disk_bytenr,
+			u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes,
+			u8 compression, u8 encryption, u16 other_encoding,
+			int extent_type)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_file_extent_item *item;
+	struct extent_buffer *eb;
+	struct btrfs_key key;
+	u64 hint;
+	int err;
+
+	BUG_ON(!path);
+	/* clear out whatever already maps the range being filled */
+	err = btrfs_drop_extents(trans, root, inode, file_pos,
+				 file_pos + num_bytes, file_pos, &hint);
+	BUG_ON(err);
+
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = file_pos;
+	err = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*item));
+	BUG_ON(err);
+	eb = path->nodes[0];
+	item = btrfs_item_ptr(eb, path->slots[0],
+			      struct btrfs_file_extent_item);
+	btrfs_set_file_extent_type(eb, item, extent_type);
+	btrfs_set_file_extent_generation(eb, item, trans->transid);
+	btrfs_set_file_extent_disk_bytenr(eb, item, disk_bytenr);
+	btrfs_set_file_extent_disk_num_bytes(eb, item, disk_num_bytes);
+	btrfs_set_file_extent_offset(eb, item, 0);
+	btrfs_set_file_extent_num_bytes(eb, item, num_bytes);
+	btrfs_set_file_extent_ram_bytes(eb, item, ram_bytes);
+	btrfs_set_file_extent_compression(eb, item, compression);
+	btrfs_set_file_extent_encryption(eb, item, encryption);
+	btrfs_set_file_extent_other_encoding(eb, item, other_encoding);
+	btrfs_mark_buffer_dirty(eb);
+
+	inode_add_bytes(inode, num_bytes);
+	btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
+	/* reuse the key to describe the disk extent: (bytenr, length) */
+	key.objectid = disk_bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = disk_num_bytes;
+	err = btrfs_alloc_reserved_extent(trans, root, eb->start,
+					  root->root_key.objectid,
+					  trans->transid, inode->i_ino, &key);
+	BUG_ON(err);
+	btrfs_free_path(path);
+	return 0;
+}
+
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1016,12 +1110,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_file_extent_item *extent_item;
-	struct btrfs_path *path = NULL;
-	struct extent_buffer *leaf;
-	u64 alloc_hint = 0;
-	struct list_head list;
-	struct btrfs_key ins;
+	int compressed = 0;
 	int ret;
 
 	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
@@ -1035,67 +1124,30 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
 		goto nocow;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
 	lock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
 
-	INIT_LIST_HEAD(&list);
-
-	ret = btrfs_drop_extents(trans, root, inode,
-				 ordered_extent->file_offset,
-				 ordered_extent->file_offset +
-				 ordered_extent->len,
-				 ordered_extent->file_offset, &alloc_hint);
-	BUG_ON(ret);
-
-	ins.objectid = inode->i_ino;
-	ins.offset = ordered_extent->file_offset;
-	ins.type = BTRFS_EXTENT_DATA_KEY;
-	ret = btrfs_insert_empty_item(trans, root, path, &ins,
-				      sizeof(*extent_item));
-	BUG_ON(ret);
-	leaf = path->nodes[0];
-	extent_item = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_file_extent_item);
-	btrfs_set_file_extent_generation(leaf, extent_item, trans->transid);
-	btrfs_set_file_extent_type(leaf, extent_item, BTRFS_FILE_EXTENT_REG);
-	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
-					  ordered_extent->start);
-	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
-					     ordered_extent->disk_len);
-	btrfs_set_file_extent_offset(leaf, extent_item, 0);
-
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
-		btrfs_set_file_extent_compression(leaf, extent_item, 1);
-	else
-		btrfs_set_file_extent_compression(leaf, extent_item, 0);
-	btrfs_set_file_extent_encryption(leaf, extent_item, 0);
-	btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
-
-	/* ram bytes = extent_num_bytes for now */
-	btrfs_set_file_extent_num_bytes(leaf, extent_item,
-					ordered_extent->len);
-	btrfs_set_file_extent_ram_bytes(leaf, extent_item,
-					ordered_extent->len);
-	btrfs_mark_buffer_dirty(leaf);
-
-	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
-				ordered_extent->file_offset +
-				ordered_extent->len - 1, 0);
-
-	ins.objectid = ordered_extent->start;
-	ins.offset = ordered_extent->disk_len;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
-					  root->root_key.objectid,
-					  trans->transid, inode->i_ino, &ins);
-	BUG_ON(ret);
-	btrfs_release_path(root, path);
-
-	inode_add_bytes(inode, ordered_extent->len);
+		compressed = 1;
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+		BUG_ON(compressed);
+		ret = btrfs_mark_extent_written(trans, root, inode,
+						ordered_extent->file_offset,
+						ordered_extent->file_offset +
+						ordered_extent->len);
+		BUG_ON(ret);
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						ordered_extent->file_offset,
+						ordered_extent->start,
+						ordered_extent->disk_len,
+						ordered_extent->len,
+						ordered_extent->len,
+						compressed, 0, 0,
+						BTRFS_FILE_EXTENT_REG);
+		BUG_ON(ret);
+	}
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
@@ -1115,8 +1167,6 @@ nocow:
 	btrfs_put_ordered_extent(ordered_extent);
 
 	btrfs_end_transaction(trans, root);
-	if (path)
-		btrfs_free_path(path);
 	return 0;
 }
 
@@ -3488,7 +3538,8 @@ again:
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
 	compressed = btrfs_file_extent_compression(leaf, item);
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		extent_end = extent_start +
 		       btrfs_file_extent_num_bytes(leaf, item);
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -3521,7 +3572,8 @@ again:
 		goto not_found_em;
 	}
 
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
@@ -3538,6 +3590,8 @@ again:
 			bytenr += btrfs_file_extent_offset(leaf, item);
 			em->block_start = bytenr;
 			em->block_len = em->len;
+			if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
+				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		}
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -3969,6 +4023,7 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
 	if (error)
 		return error;
 
+	atomic_inc(&inode->i_count);
 	d_instantiate(dentry, inode);
 	return 0;
 }
@@ -4318,6 +4373,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+	inode_set_bytes(inode, name_len);
 	btrfs_i_size_write(inode, name_len - 1);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
@@ -4335,6 +4391,130 @@ out_fail:
 	return err;
 }
 
+/* reserve and insert PREALLOC file extents covering [start, end); a failed
+ * reservation is returned even when earlier extents were already inserted
+ */
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+			       u64 alloc_hint, int mode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key ins;
+	u64 cur_offset = start;
+	u64 num_bytes = end - start;
+	int ret = 0;
+	int err;
+
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(!trans);
+	btrfs_set_trans_block_group(trans, inode);
+
+	while (num_bytes > 0) {
+		ret = btrfs_reserve_extent(trans, root,
+			min(num_bytes, root->fs_info->max_extent),
+			root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1);
+		if (ret) {
+			WARN_ON(1);
+			break;
+		}
+		ret = insert_reserved_file_extent(trans, inode, cur_offset,
+			ins.objectid, ins.offset, ins.offset, ins.offset,
+			0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC);
+		BUG_ON(ret);
+		num_bytes -= ins.offset;
+		cur_offset += ins.offset;
+		alloc_hint = ins.objectid + ins.offset;
+	}
+	if (cur_offset > start) {
+		inode->i_ctime = CURRENT_TIME;
+		btrfs_set_flag(inode, PREALLOC);
+		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		    cur_offset > i_size_read(inode))
+			btrfs_i_size_write(inode, cur_offset);
+		/* separate status: ret may still hold a reservation error */
+		err = btrfs_update_inode(trans, root, inode);
+		BUG_ON(err);
+	}
+
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+/* preallocate extents for every hole in the sector aligned range covering
+ * [offset, offset + len).  The range is locked once no ordered extent
+ * overlaps it, and a range starting past i_size goes through
+ * btrfs_cont_expand() first.  Returns 0 or the error that stopped the work.
+ */
+static long btrfs_fallocate(struct inode *inode, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+	u64 alloc_start = offset & ~mask;
+	u64 alloc_end = (offset + len + mask) & ~mask;
+	u64 alloc_hint = 0;
+	u64 cur_offset;
+	u64 last_byte;
+	struct extent_map *em;
+	int ret;
+
+	mutex_lock(&inode->i_mutex);
+	if (alloc_start > inode->i_size) {
+		ret = btrfs_cont_expand(inode, alloc_start);
+		if (ret)
+			goto out;
+	}
+
+	/* retry until the locked range has no overlapping ordered extent */
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		int busy;
+
+		lock_extent(io_tree, alloc_start, alloc_end - 1, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    alloc_end - 1);
+		busy = ordered &&
+		       ordered->file_offset + ordered->len > alloc_start &&
+		       ordered->file_offset < alloc_end;
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		if (!busy)
+			break;
+		unlock_extent(io_tree, alloc_start, alloc_end - 1, GFP_NOFS);
+		btrfs_wait_ordered_range(inode, alloc_start,
+					 alloc_end - alloc_start);
+	}
+
+	cur_offset = alloc_start;
+	do {
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				      alloc_end - cur_offset, 0);
+		BUG_ON(IS_ERR(em) || !em);
+		/* one extent map per pass, its end rounded up to a sector */
+		last_byte = min(extent_map_end(em), alloc_end);
+		last_byte = (last_byte + mask) & ~mask;
+		if (em->block_start == EXTENT_MAP_HOLE) {
+			ret = prealloc_file_range(inode, cur_offset,
+					last_byte, alloc_hint, mode);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
+			}
+		}
+		/* a mapped extent's block_start becomes the next alloc hint */
+		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
+			alloc_hint = em->block_start;
+		free_extent_map(em);
+
+		cur_offset = last_byte;
+		ret = 0;
+	} while (cur_offset < alloc_end);
+	unlock_extent(io_tree, alloc_start, alloc_end - 1, GFP_NOFS);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 static int btrfs_set_page_dirty(struct page *page)
 {
 	return __set_page_dirty_nobuffers(page);
@@ -4421,6 +4601,7 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.listxattr      = btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
+	.fallocate	= btrfs_fallocate,
 };
 static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7f915d47839..9ff2b4e0e92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -724,7 +724,8 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 			found_type = btrfs_file_extent_type(leaf, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
+			if (found_type == BTRFS_FILE_EXTENT_REG ||
+			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 				u64 ds = btrfs_file_extent_disk_bytenr(leaf,
 								       extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e7317c8fda2..370bb428559 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -182,7 +182,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->len = len;
 	entry->disk_len = disk_len;
 	entry->inode = inode;
-	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_COMPRESSED)
+	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
 	/* one ref for the tree */
@@ -339,7 +339,8 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 		ordered = list_entry(cur, struct btrfs_ordered_extent,
 				     root_extent_list);
 		if (nocow_only &&
-		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
 			list_move(&ordered->root_extent_list,
 				  &root->fs_info->ordered_extents);
 			cond_resched_lock(&root->fs_info->ordered_extent_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e6d9bc54c2b..260bf95dfe0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,6 +68,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
 
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -132,7 +134,7 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int type);
+			     u64 start, u64 len, u64 disk_len, int tyep);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e0201c3a7dc..be4fc30a30e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -442,7 +442,8 @@ insert:
 
 		fi = (struct btrfs_file_extent_item *)dst_ptr;
 		extent_type = btrfs_file_extent_type(path->nodes[0], fi);
-		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			struct btrfs_key ins;
 			ins.objectid = btrfs_file_extent_disk_bytenr(
 							path->nodes[0], fi);
@@ -538,7 +539,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 	found_type = btrfs_file_extent_type(eb, item);
 
-	if (found_type == BTRFS_FILE_EXTENT_REG)
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
 		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
 	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size = btrfs_file_extent_inline_len(eb, item);
@@ -562,7 +564,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
 				       start, 0);
 
-	if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
+	if (ret == 0 &&
+	    (found_type == BTRFS_FILE_EXTENT_REG ||
+	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 		struct btrfs_file_extent_item cmp1;
 		struct btrfs_file_extent_item cmp2;
 		struct btrfs_file_extent_item *existing;
@@ -2522,7 +2526,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						struct btrfs_file_extent_item);
 
 			found_type = btrfs_file_extent_type(src, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
+			if (found_type == BTRFS_FILE_EXTENT_REG ||
+			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 				u64 ds = btrfs_file_extent_disk_bytenr(src,
 								   extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(src,
-- 
cgit v1.2.3-70-g09d2


From b2950863c61bc24cf0f63bc05947d9d50663c4c0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 09:54:17 -0500
Subject: Btrfs: make things static and include the right headers

Shut up various sparse warnings about symbols that should be either
static or have their declarations in scope.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.c            |  2 +-
 fs/btrfs/disk-io.c          | 16 ++++++++--------
 fs/btrfs/extent-tree.c      | 12 ++++++------
 fs/btrfs/extent_io.c        | 35 +++++++++++++++--------------------
 fs/btrfs/free-space-cache.c |  6 ++++--
 fs/btrfs/inode-item.c       |  2 +-
 fs/btrfs/inode.c            | 26 +++++++++++++-------------
 fs/btrfs/ioctl.c            | 14 +++++++-------
 fs/btrfs/root-tree.c        |  2 ++
 fs/btrfs/super.c            |  2 +-
 fs/btrfs/tree-log.c         |  5 +++--
 fs/btrfs/volumes.c          | 12 ++++++------
 fs/btrfs/zlib.c             |  1 +
 13 files changed, 68 insertions(+), 67 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 71ef0a2e2da..a83cbdf1d8c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -217,7 +217,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
  * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
  * is used to finish the allocation.
  */
-int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
+static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb04665e500..8a2bcc7024f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -93,9 +93,9 @@ struct async_submit_bio {
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
  */
-struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 len,
-				    int create)
+static struct extent_map *btree_get_extent(struct inode *inode,
+		struct page *page, size_t page_offset, u64 start, u64 len,
+		int create)
 {
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_map *em;
@@ -295,7 +295,7 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
  * checksum a dirty tree block before IO.  This has extra checks to make
  * sure we only fill in the checksum field in the first page of a multi-page block
  */
-int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -365,7 +365,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
-int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
 	struct extent_io_tree *tree;
@@ -660,7 +660,7 @@ static int btree_writepages(struct address_space *mapping,
 	return extent_writepages(tree, mapping, btree_get_extent, wbc);
 }
 
-int btree_readpage(struct file *file, struct page *page)
+static int btree_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1200,7 +1200,7 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	}
 }
 
-void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 	struct inode *inode;
 	struct extent_map_tree *em_tree;
@@ -1842,7 +1842,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
-int write_all_supers(struct btrfs_root *root)
+static int write_all_supers(struct btrfs_root *root)
 {
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a970472eab1..d1563852938 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,7 +74,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  * this adds the block group to the fs_info rb tree for the block group
  * cache
  */
-int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 				struct btrfs_block_group_cache *block_group)
 {
 	struct rb_node **p;
@@ -289,7 +289,7 @@ err:
 /*
  * return the block group that starts at or after bytenr
  */
-struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
+static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 						       btrfs_fs_info *info,
 							 u64 bytenr)
 {
@@ -3445,7 +3445,7 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 			      u32 *refs)
 {
 	int ret;
@@ -5434,7 +5434,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-int __alloc_chunk_for_shrink(struct btrfs_root *root,
+static int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		     struct btrfs_block_group_cache *shrink_block_group,
 		     int force)
 {
@@ -5703,8 +5703,8 @@ out:
 	return ret;
 }
 
-int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
-			   struct btrfs_key *key)
+static int find_first_block_group(struct btrfs_root *root,
+		struct btrfs_path *path, struct btrfs_key *key)
 {
 	int ret = 0;
 	struct btrfs_key found_key;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d79ccdbfdd9..c3dfe2a0ec8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -112,7 +112,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
-struct extent_state *alloc_extent_state(gfp_t mask)
+static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
 #ifdef LEAK_DEBUG
@@ -136,7 +136,7 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 }
 EXPORT_SYMBOL(alloc_extent_state);
 
-void free_extent_state(struct extent_state *state)
+static void free_extent_state(struct extent_state *state)
 {
 	if (!state)
 		return;
@@ -662,7 +662,7 @@ static void set_state_bits(struct extent_io_tree *tree,
  * [start, end] is inclusive
  * This takes the tree lock.
  */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 		   int exclusive, u64 *failed_start, gfp_t mask)
 {
 	struct extent_state *state;
@@ -879,12 +879,11 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_new);
 
-int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_new);
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask)
@@ -894,27 +893,24 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_uptodate);
 
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			  gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_uptodate);
 
-int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			 gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
 			      0, NULL, mask);
 }
-EXPORT_SYMBOL(set_extent_writeback);
 
-int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			   gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_writeback);
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
@@ -994,7 +990,7 @@ EXPORT_SYMBOL(set_range_dirty);
 /*
  * helper function to set both pages and extents in the tree writeback
  */
-int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1010,7 +1006,6 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	set_extent_writeback(tree, start, end, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(set_range_writeback);
 
 /*
  * find the first offset in the io tree with 'bits' set. zero is
@@ -1432,11 +1427,13 @@ out:
 	spin_unlock_irq(&tree->lock);
 	return total_bytes;
 }
+
+#if 0
 /*
  * helper function to lock both pages and extents in the tree.
  * pages must be locked first.
  */
-int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
+static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1473,12 +1470,11 @@ failed:
 	}
 	return err;
 }
-EXPORT_SYMBOL(lock_range);
 
 /*
  * helper function to unlock both pages and extents in the tree.
  */
-int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
+static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1493,7 +1489,7 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 	unlock_extent(tree, start, end, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(unlock_range);
+#endif
 
 /*
  * set the private field for a given byte offset in the tree.  If there isn't
@@ -1956,7 +1952,7 @@ void set_page_extent_mapped(struct page *page)
 }
 EXPORT_SYMBOL(set_page_extent_mapped);
 
-void set_page_extent_head(struct page *page, unsigned long len)
+static void set_page_extent_head(struct page *page, unsigned long len)
 {
 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
@@ -2397,7 +2393,7 @@ update_nr_written:
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
  */
-int extent_write_cache_pages(struct extent_io_tree *tree,
+static int extent_write_cache_pages(struct extent_io_tree *tree,
 			     struct address_space *mapping,
 			     struct writeback_control *wbc,
 			     writepage_t writepage, void *data,
@@ -2502,7 +2498,6 @@ retry:
 	}
 	return ret;
 }
-EXPORT_SYMBOL(extent_write_cache_pages);
 
 static noinline void flush_write_bio(void *data)
 {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f4926c0f3c8..09462adfbe3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -443,7 +443,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 	mutex_unlock(&block_group->alloc_mutex);
 }
 
-struct btrfs_free_space *btrfs_find_free_space_offset(struct
+#if 0
+static struct btrfs_free_space *btrfs_find_free_space_offset(struct
 						      btrfs_block_group_cache
 						      *block_group, u64 offset,
 						      u64 bytes)
@@ -458,7 +459,7 @@ struct btrfs_free_space *btrfs_find_free_space_offset(struct
 	return ret;
 }
 
-struct btrfs_free_space *btrfs_find_free_space_bytes(struct
+static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 						     btrfs_block_group_cache
 						     *block_group, u64 offset,
 						     u64 bytes)
@@ -472,6 +473,7 @@ struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 
 	return ret;
 }
+#endif
 
 struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
 					       *block_group, u64 offset,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index d93451c66ba..3d46fa1f29a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -20,7 +20,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-int find_name_in_backref(struct btrfs_path *path, const char * name,
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
 			 int name_len, struct btrfs_inode_ref **ref_ret)
 {
 	struct extent_buffer *leaf;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3d4078b69a..bd58ba655a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1130,7 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
 	unsigned long flags;
@@ -1151,7 +1151,7 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 /*
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
-int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			 unsigned long old, unsigned long bits)
 {
 	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
@@ -1215,7 +1215,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
+static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1234,7 +1234,7 @@ int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1245,7 +1245,7 @@ int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * extent_io.c submission hook. This does the right thing for csum calculation on write,
  * or reading the csums from the tree before a read
  */
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1313,7 +1313,7 @@ struct btrfs_writepage_fixup {
 	struct btrfs_work work;
 };
 
-void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
@@ -1372,7 +1372,7 @@ out_page:
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
  */
-int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_writepage_fixup *fixup;
@@ -1526,7 +1526,7 @@ nocow:
 	return 0;
 }
 
-int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
@@ -1548,7 +1548,7 @@ struct io_failure_record {
 	int last_mirror;
 };
 
-int btrfs_io_failed_hook(struct bio *failed_bio,
+static int btrfs_io_failed_hook(struct bio *failed_bio,
 			 struct page *page, u64 start, u64 end,
 			 struct extent_state *state)
 {
@@ -1642,7 +1642,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-int btrfs_clean_io_failures(struct inode *inode, u64 start)
+static int btrfs_clean_io_failures(struct inode *inode, u64 start)
 {
 	u64 private;
 	u64 private_failure;
@@ -1675,7 +1675,7 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
  * if there's a match, we allow the bio to finish.  If not, we go through
  * the io_failure_record routines to find good copies
  */
-int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
@@ -4362,8 +4362,8 @@ out:
  * Invalidate a single dcache entry at the root of the filesystem.
  * Needed after creation of snapshot or subvolume.
  */
-void btrfs_invalidate_dcache_root(struct inode *dir, char *name,
-				  int namelen)
+static void btrfs_invalidate_dcache_root(struct inode *dir,
+		char *name, int namelen)
 {
 	struct dentry *alias, *entry;
 	struct qstr qstr;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 35f650e183e..cc7c5161e26 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -354,7 +354,7 @@ out_unlock:
 }
 
 
-int btrfs_defrag_file(struct file *file)
+static int btrfs_defrag_file(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -649,7 +649,7 @@ static int btrfs_ioctl_defrag(struct file *file)
 	return 0;
 }
 
-long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
@@ -671,7 +671,7 @@ out:
 	return ret;
 }
 
-long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
@@ -696,8 +696,8 @@ out:
 	return ret;
 }
 
-long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off,
-		       u64 olen, u64 destoff)
+static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+		u64 off, u64 olen, u64 destoff)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1035,7 +1035,7 @@ out_fput:
 	return ret;
 }
 
-long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
+static long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
 {
 	struct btrfs_ioctl_clone_range_args args;
 
@@ -1051,7 +1051,7 @@ long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
  * basically own the machine, and have a very in depth understanding
  * of all the possible deadlocks and enospc problems.
  */
-long btrfs_ioctl_trans_start(struct file *file)
+static long btrfs_ioctl_trans_start(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index dbe20d4c6ea..f99335a999d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -276,6 +276,7 @@ out:
 	return ret;
 }
 
+#if 0 /* this will get used when snapshot deletion is implemented */
 int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
 		       u64 root_id, u8 type, u64 ref_id)
@@ -299,6 +300,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 	btrfs_free_path(path);
 	return ret;
 }
+#endif
 
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 		   struct btrfs_path *path,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1975ea273dc..93a21c77064 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -647,7 +647,7 @@ static int btrfs_interface_init(void)
 	return misc_register(&btrfs_misc);
 }
 
-void btrfs_interface_exit(void)
+static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
 		printk("misc_deregister failed for control device");
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index be4fc30a30e..4fcfc8b1189 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -23,6 +23,7 @@
 #include "locking.h"
 #include "print-tree.h"
 #include "compat.h"
+#include "tree-log.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -78,7 +79,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
  * tree of log tree roots.  This must be called with a tree log transaction
  * running (see start_log_trans).
  */
-int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root)
 {
 	struct btrfs_key key;
@@ -1934,7 +1935,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int wait_log_commit(struct btrfs_root *log)
+static int wait_log_commit(struct btrfs_root *log)
 {
 	DEFINE_WAIT(wait);
 	u64 transid = log->fs_info->tree_log_transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 724ead54529..769f2c5d9e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -238,7 +238,7 @@ done:
 	return 0;
 }
 
-void pending_bios_fn(struct btrfs_work *work)
+static void pending_bios_fn(struct btrfs_work *work)
 {
 	struct btrfs_device *device;
 
@@ -686,7 +686,7 @@ error:
 	return ret;
 }
 
-int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 			  struct btrfs_device *device,
 			  u64 start)
 {
@@ -1393,7 +1393,7 @@ error:
 	goto out;
 }
 
-int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
 				 struct btrfs_device *device)
 {
 	int ret;
@@ -1497,7 +1497,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 			chunk_offset)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -1543,7 +1543,7 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	return ret;
 }
 
-int btrfs_relocate_chunk(struct btrfs_root *root,
+static int btrfs_relocate_chunk(struct btrfs_root *root,
 			 u64 chunk_tree, u64 chunk_objectid,
 			 u64 chunk_offset)
 {
@@ -1884,7 +1884,7 @@ done:
 	return ret;
 }
 
-int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 5b9f7002513..c4617cde6c7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
+#include "compression.h"
 
 /* Plan: call deflate() with avail_in == *sourcelen,
 	avail_out = *dstlen - 12 and flush == Z_FINISH.
-- 
cgit v1.2.3-70-g09d2


From 607d432da0542e84ddcd358adfddac6f68500e3d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 2 Dec 2008 07:17:45 -0500
Subject: Btrfs: add support for multiple csum algorithms

This patch gives us the space we will need in order to have different csum
algorithms at some point in the future.  We save the csum algorithm type
in the superblock, and use that instead of #defines.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.h     | 19 +++++++++++++++++-
 fs/btrfs/disk-io.c   | 25 ++++++++++++++++++-----
 fs/btrfs/file-item.c | 56 ++++++++++++++++++++++++++++++----------------------
 fs/btrfs/ioctl.c     |  9 +++++----
 fs/btrfs/tree-log.c  | 10 ++++++----
 5 files changed, 81 insertions(+), 38 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b5af1fc77c5..6d8350332b1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -109,8 +109,14 @@ struct btrfs_ordered_sum;
 
 /* 32 bytes in various csum fields */
 #define BTRFS_CSUM_SIZE 32
+
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32	0
+
+static int btrfs_csum_sizes[] = { 4, 0 };
+
 /* four bytes for CRC32 */
-#define BTRFS_CRC32_SIZE 4
+//#define BTRFS_CRC32_SIZE 4
 #define BTRFS_EMPTY_DIR_SIZE 0
 
 #define BTRFS_FT_UNKNOWN	0
@@ -308,6 +314,7 @@ struct btrfs_super_block {
 	__le64 compat_flags;
 	__le64 compat_ro_flags;
 	__le64 incompat_flags;
+	__le16 csum_type;
 	u8 root_level;
 	u8 chunk_root_level;
 	u8 log_root_level;
@@ -1483,6 +1490,7 @@ BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
 
 /* struct btrfs_super_block */
+
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -1524,6 +1532,15 @@ BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+			 csum_type, 16);
+
+static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+{
+	int t = btrfs_super_csum_type(s);
+	BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+	return btrfs_csum_sizes[t];
+}
 
 static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfd5ba05ce4..3eb7c2576fe 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -176,7 +176,9 @@ void btrfs_csum_final(u32 crc, char *result)
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
-	char result[BTRFS_CRC32_SIZE];
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
+	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
 	unsigned long offset = BTRFS_CSUM_SIZE;
@@ -186,6 +188,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	unsigned long map_len;
 	int err;
 	u32 crc = ~(u32)0;
+	unsigned long inline_result;
 
 	len = buf->len - offset;
 	while(len > 0) {
@@ -204,25 +207,37 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		offset += cur_len;
 		unmap_extent_buffer(buf, map_token, KM_USER0);
 	}
+	if (csum_size > sizeof(inline_result)) {
+		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+		if (!result)
+			return 1;
+	} else {
+		result = (char *)&inline_result;
+	}
+
 	btrfs_csum_final(crc, result);
 
 	if (verify) {
 		/* FIXME, this is not good */
-		if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
+		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 			u32 val;
 			u32 found = 0;
-			memcpy(&found, result, BTRFS_CRC32_SIZE);
+			memcpy(&found, result, csum_size);
 
-			read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
+			read_extent_buffer(buf, &val, 0, csum_size);
 			printk("btrfs: %s checksum verify failed on %llu "
 			       "wanted %X found %X level %d\n",
 			       root->fs_info->sb->s_id,
 			       buf->start, val, found, btrfs_header_level(buf));
+			if (result != (char *)&inline_result)
+				kfree(result);
 			return 1;
 		}
 	} else {
-		write_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE);
+		write_extent_buffer(buf, result, 0, csum_size);
 	}
+	if (result != (char *)&inline_result)
+		kfree(result);
 	return 0;
 }
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f7637883140..234ed441736 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,9 +24,9 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-#define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
-			       sizeof(struct btrfs_item) * 2) / \
-			       BTRFS_CRC32_SIZE) - 1))
+#define MAX_CSUM_ITEMS(r,size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+				   sizeof(struct btrfs_item) * 2) / \
+				  size) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -83,6 +83,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	struct btrfs_csum_item *item;
 	struct extent_buffer *leaf;
 	u64 csum_offset = 0;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	int csums_in_item;
 
 	file_key.objectid = objectid;
@@ -105,7 +107,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		csum_offset = (offset - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
-		csums_in_item /= BTRFS_CRC32_SIZE;
+		csums_in_item /= csum_size;
 
 		if (csum_offset >= csums_in_item) {
 			ret = -EFBIG;
@@ -114,7 +116,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	}
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
-					  csum_offset * BTRFS_CRC32_SIZE);
+					  csum_offset * csum_size);
 	return item;
 fail:
 	if (ret > 0)
@@ -150,6 +152,8 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u32 diff;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item = NULL;
@@ -195,7 +199,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			item_size = btrfs_item_size_nr(path->nodes[0],
 						       path->slots[0]);
 			item_last_offset = item_start_offset +
-				(item_size / BTRFS_CRC32_SIZE) *
+				(item_size / csum_size) *
 				root->sectorsize;
 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 					      struct btrfs_csum_item);
@@ -206,11 +210,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 		 */
 		diff = offset - item_start_offset;
 		diff = diff / root->sectorsize;
-		diff = diff * BTRFS_CRC32_SIZE;
+		diff = diff * csum_size;
 
 		read_extent_buffer(path->nodes[0], &sum,
 				   ((unsigned long)item) + diff,
-				   BTRFS_CRC32_SIZE);
+				   csum_size);
 found:
 		set_state_private(io_tree, offset, sum);
 		bio_index++;
@@ -383,6 +387,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	char *eb_token;
 	unsigned long map_len;
 	unsigned long map_start;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -408,7 +414,8 @@ again:
 		/* we found one, but it isn't big enough yet */
 		leaf = path->nodes[0];
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-		if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) {
+		if ((item_size / csum_size) >=
+		    MAX_CSUM_ITEMS(root, csum_size)) {
 			/* already at max size, make a new one */
 			goto insert;
 		}
@@ -441,7 +448,7 @@ again:
 	 */
 	btrfs_release_path(root, path);
 	ret = btrfs_search_slot(trans, root, &file_key, path,
-				BTRFS_CRC32_SIZE, 1);
+				csum_size, 1);
 	if (ret < 0)
 		goto fail_unlock;
 	if (ret == 0) {
@@ -457,14 +464,14 @@ again:
 			root->fs_info->sb->s_blocksize_bits;
 	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 	    found_key.objectid != objectid ||
-	    csum_offset >= MAX_CSUM_ITEMS(root)) {
+	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
 		goto insert;
 	}
 	if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
-	    BTRFS_CRC32_SIZE) {
-		u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
+	    csum_size) {
+		u32 diff = (csum_offset + 1) * csum_size;
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != BTRFS_CRC32_SIZE)
+		if (diff != csum_size)
 			goto insert;
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
@@ -479,10 +486,10 @@ insert:
 		tmp -= offset & ~((u64)root->sectorsize -1);
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
 		tmp = max((u64)1, tmp);
-		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root));
-		ins_size = BTRFS_CRC32_SIZE * tmp;
+		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+		ins_size = csum_size * tmp;
 	} else {
-		ins_size = BTRFS_CRC32_SIZE;
+		ins_size = csum_size;
 	}
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      ins_size);
@@ -497,7 +504,7 @@ csum:
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	ret = 0;
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
-					  csum_offset * BTRFS_CRC32_SIZE);
+					  csum_offset * csum_size);
 found:
 	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
@@ -508,14 +515,14 @@ found:
 next_sector:
 
 	if (!eb_token ||
-	   (unsigned long)item  + BTRFS_CRC32_SIZE >= map_start + map_len) {
+	   (unsigned long)item + csum_size >= map_start + map_len) {
 		int err;
 
 		if (eb_token)
 			unmap_extent_buffer(leaf, eb_token, KM_USER1);
 		eb_token = NULL;
 		err = map_private_extent_buffer(leaf, (unsigned long)item,
-						BTRFS_CRC32_SIZE,
+						csum_size,
 						&eb_token, &eb_map,
 						&map_start, &map_len, KM_USER1);
 		if (err)
@@ -523,17 +530,17 @@ next_sector:
 	}
 	if (eb_token) {
 		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       &sector_sum->sum, BTRFS_CRC32_SIZE);
+		       &sector_sum->sum, csum_size);
 	} else {
 		write_extent_buffer(leaf, &sector_sum->sum,
-				    (unsigned long)item, BTRFS_CRC32_SIZE);
+				    (unsigned long)item, csum_size);
 	}
 
 	total_bytes += root->sectorsize;
 	sector_sum++;
 	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
-						  BTRFS_CRC32_SIZE);
+						  csum_size);
 		if (item < item_end && offset + PAGE_CACHE_SIZE ==
 		    sector_sum->offset) {
 			    offset = sector_sum->offset;
@@ -577,7 +584,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 	new_item_span = isize - key.offset;
 	blocks = (new_item_span + root->sectorsize - 1) >>
 		root->fs_info->sb->s_blocksize_bits;
-	new_item_size = blocks * BTRFS_CRC32_SIZE;
+	new_item_size = blocks *
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	if (new_item_size >= btrfs_item_size_nr(leaf, slot))
 		return 0;
 	ret = btrfs_truncate_item(trans, root, path, new_item_size, 1);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index caea9eed9d6..b4da53d55c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -714,7 +714,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
 	u64 hint_byte;
-
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	/*
 	 * TODO:
 	 * - split compressed inline extents.  annoying: we need to
@@ -964,7 +965,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			int coff, clen;
 
 			size = btrfs_item_size_nr(leaf, slot);
-			coverslen = (size / BTRFS_CRC32_SIZE) <<
+			coverslen = (size / csum_size) <<
 				root->fs_info->sb->s_blocksize_bits;
 			printk("csums for %llu~%llu\n",
 			       key.offset, coverslen);
@@ -981,12 +982,12 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			if (off > key.offset)
 				coff = ((off - key.offset) >>
 					root->fs_info->sb->s_blocksize_bits) *
-					BTRFS_CRC32_SIZE;
+					csum_size;
 			clen = size - coff;
 			if (key.offset + coverslen > off+len)
 				clen -= ((key.offset+coverslen-off-len) >>
 					 root->fs_info->sb->s_blocksize_bits) *
-					BTRFS_CRC32_SIZE;
+					csum_size;
 			printk(" will dup %d~%d of %d\n",
 			       coff, clen, size);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4fcfc8b1189..c766649ad45 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -929,13 +929,15 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	int ret;
 	u32 item_size = btrfs_item_size_nr(eb, slot);
 	u64 cur_offset;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	unsigned long file_bytes;
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
 	struct inode *inode;
 	unsigned long ptr;
 
-	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
+	file_bytes = (item_size / csum_size) * root->sectorsize;
 	inode = read_one_inode(root, key->objectid);
 	if (!inode) {
 		return -EIO;
@@ -959,10 +961,10 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	while(item_size > 0) {
 		sector_sum->offset = cur_offset;
-		read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
+		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
-		item_size -= BTRFS_CRC32_SIZE;
-		ptr += BTRFS_CRC32_SIZE;
+		item_size -= csum_size;
+		ptr += csum_size;
 		cur_offset += root->sectorsize;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From d20f7043fa65659136c1a7c3c456eeeb5c6f431f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:58:54 -0500
Subject: Btrfs: move data checksumming into a dedicated tree

Btrfs stores checksums for each data block.  Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block.  This means that when we read the inode,
we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file.  When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data.  It would be faster if we could checksum
the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and
storing that on disk.  This is significantly less secure.

* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct.  This makes the raid
layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume
referencing an extent.

The solution used here is to store the extent checksums in a dedicated
tree.  This allows us to index the checksums by physical extent
start and length.  It means:

* The checksum is against the data stored on disk, after any compression
or encryption is done.

* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of
data that needs to be checksummed.  It will also allow much faster
raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent.  This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c  | 124 ++++++++++++++++++++++++++++----
 fs/btrfs/ctree.h        |  30 +++++---
 fs/btrfs/disk-io.c      |  45 ++++++++++--
 fs/btrfs/extent_io.c    |   5 ++
 fs/btrfs/file-item.c    | 185 ++++++++++++++++++------------------------------
 fs/btrfs/inode.c        |  45 ++++++------
 fs/btrfs/ioctl.c        |  55 +-------------
 fs/btrfs/ordered-data.c |   7 +-
 fs/btrfs/ordered-data.h |  10 +--
 fs/btrfs/tree-log.c     | 121 +++++++++++++++++++++++++++----
 fs/btrfs/volumes.c      |   1 +
 11 files changed, 387 insertions(+), 241 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4febe2eb0b8..ad727413730 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -69,11 +69,27 @@ struct compressed_bio {
 
 	/* IO errors */
 	int errors;
+	int mirror_num;
 
 	/* for reads, this is the bio we are copying the data into */
 	struct bio *orig_bio;
+
+	/*
+	 * the start of a variable length array of checksums only
+	 * used by reads
+	 */
+	u32 sums;
 };
 
+static inline int compressed_bio_size(struct btrfs_root *root,
+				      unsigned long disk_size)
+{
+	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	return sizeof(struct compressed_bio) +
+		((disk_size + root->sectorsize - 1) / root->sectorsize) *
+		csum_size;
+}
+
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
 					u64 first_byte, gfp_t gfp_flags)
 {
@@ -96,6 +112,47 @@ static struct bio *compressed_bio_alloc(struct block_device *bdev,
 	return bio;
 }
 
+static int check_compressed_csum(struct inode *inode,
+				 struct compressed_bio *cb,
+				 u64 disk_start)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *page;
+	unsigned long i;
+	char *kaddr;
+	u32 csum;
+	u32 *cb_sum = &cb->sums;
+
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
+		return 0;
+
+	for (i = 0; i < cb->nr_pages; i++) {
+		page = cb->compressed_pages[i];
+		csum = ~(u32)0;
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+		btrfs_csum_final(csum, (char *)&csum);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		if (csum != *cb_sum) {
+			printk("btrfs csum failed ino %lu extent %llu csum %u "
+			       "wanted %u mirror %d\n", inode->i_ino,
+			       (unsigned long long)disk_start,
+			       csum, *cb_sum, cb->mirror_num);
+			ret = -EIO;
+			goto fail;
+		}
+		cb_sum++;
+
+	}
+	ret = 0;
+fail:
+	return ret;
+}
+
 /* when we finish reading compressed pages from the disk, we
  * decompress them and then run the bio end_io routines on the
  * decompressed pages (in the inode address space).
@@ -124,16 +181,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	if (!atomic_dec_and_test(&cb->pending_bios))
 		goto out;
 
+	inode = cb->inode;
+	ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+	if (ret)
+		goto csum_failed;
+
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	inode = cb->inode;
 	tree = &BTRFS_I(inode)->io_tree;
 	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
 					cb->start,
 					cb->orig_bio->bi_io_vec,
 					cb->orig_bio->bi_vcnt,
 					cb->compressed_len);
+csum_failed:
 	if (ret)
 		cb->errors = 1;
 
@@ -148,8 +210,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* do io completion on the original bio */
 	if (cb->errors) {
 		bio_io_error(cb->orig_bio);
-	} else
+	} else {
+		int bio_index = 0;
+		struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+
+		/*
+		 * we have verified the checksum already, set page
+		 * checked so the end_io handlers know about it
+		 */
+		while(bio_index < cb->orig_bio->bi_vcnt) {
+			SetPageChecked(bvec->bv_page);
+			bvec++;
+			bio_index++;
+		}
 		bio_endio(cb->orig_bio, 0);
+	}
 
 	/* finally free the cb struct */
 	kfree(cb->compressed_pages);
@@ -277,12 +352,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	int ret;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
-	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
 	cb->start = start;
 	cb->len = len;
+	cb->mirror_num = 0;
 	cb->compressed_pages = compressed_pages;
 	cb->compressed_len = compressed_len;
 	cb->orig_bio = NULL;
@@ -290,9 +366,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-	ret = btrfs_csum_file_bytes(root, inode, start, len);
-	BUG_ON(ret);
-
 	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
 	bio->bi_private = cb;
 	bio->bi_end_io = end_compressed_bio_write;
@@ -325,6 +398,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 			BUG_ON(ret);
 
+			ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+			BUG_ON(ret);
+
 			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 			BUG_ON(ret);
 
@@ -348,6 +424,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
+	ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+	BUG_ON(ret);
+
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 	BUG_ON(ret);
 
@@ -510,6 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_start;
 	struct extent_map *em;
 	int ret;
+	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
 	em_tree = &BTRFS_I(inode)->extent_tree;
@@ -521,15 +601,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				   PAGE_CACHE_SIZE);
 	spin_unlock(&em_tree->lock);
 
-	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	compressed_len = em->block_len;
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
+	cb->mirror_num = mirror_num;
+	sums = &cb->sums;
 
 	cb->start = em->orig_start;
-	compressed_len = em->block_len;
 	em_len = em->len;
 	em_start = em->start;
+
 	free_extent_map(em);
 	em = NULL;
 
@@ -551,11 +634,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	add_ra_bio_pages(inode, em_start + em_len, cb);
 
-	if (!btrfs_test_opt(root, NODATASUM) &&
-	    !btrfs_test_flag(inode, NODATASUM)) {
-		btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
-	}
-
 	/* include any pages we added in add_ra-bio_pages */
 	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
 	cb->len = uncompressed_len;
@@ -568,6 +646,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	for (page_index = 0; page_index < nr_pages; page_index++) {
 		page = cb->compressed_pages[page_index];
 		page->mapping = inode->i_mapping;
+		page->index = em_start >> PAGE_CACHE_SHIFT;
+
 		if (comp_bio->bi_size)
 			ret = tree->ops->merge_bio_hook(page, 0,
 							PAGE_CACHE_SIZE,
@@ -591,7 +671,16 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			 */
 			atomic_inc(&cb->pending_bios);
 
-			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			if (!btrfs_test_opt(root, NODATASUM) &&
+			    !btrfs_test_flag(inode, NODATASUM)) {
+				btrfs_lookup_bio_sums(root, inode, comp_bio,
+						      sums);
+			}
+			sums += (comp_bio->bi_size + root->sectorsize - 1) /
+				root->sectorsize;
+
+			ret = btrfs_map_bio(root, READ, comp_bio,
+					    mirror_num, 0);
 			BUG_ON(ret);
 
 			bio_put(comp_bio);
@@ -610,7 +699,12 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	if (!btrfs_test_opt(root, NODATASUM) &&
+	    !btrfs_test_flag(inode, NODATASUM)) {
+		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+	}
+
+	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
 	BUG_ON(ret);
 
 	bio_put(comp_bio);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 96f2ec7ad5b..242b961ae6d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -73,6 +73,9 @@ struct btrfs_ordered_sum;
 /* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
 
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -84,6 +87,13 @@ struct btrfs_ordered_sum;
 #define BTRFS_TREE_RELOC_OBJECTID -8ULL
 #define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
 
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -634,6 +644,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
+	struct btrfs_root *csum_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -716,6 +727,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
 	/*
@@ -858,13 +870,12 @@ struct btrfs_root {
  * extent data is for file data
  */
 #define BTRFS_EXTENT_DATA_KEY	108
+
 /*
- * csum items have the checksums for data in the extents
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
  */
-#define BTRFS_CSUM_ITEM_KEY	120
-
-
-/* reserve 21-31 for other file/dir stuff */
+#define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
  * root items point to tree roots.  There are typically in the root
@@ -1917,7 +1928,7 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 
 /* file-item.c */
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio);
+			  struct bio *bio, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -1929,17 +1940,16 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct inode *inode,
+			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
-		       struct bio *bio);
+		       struct bio *bio, u64 file_start, int contig);
 int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
 			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
-					  u64 objectid, u64 offset,
-					  int cow);
+					  u64 bytenr, int cow);
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3eb7c2576fe..61dc3b2c834 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -445,11 +445,18 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
-	if (bio->bi_rw & (1 << BIO_RW))
+
+	if (bio->bi_rw & (1 << BIO_RW)) {
 		btrfs_queue_worker(&fs_info->endio_write_workers,
 				   &end_io_wq->work);
-	else
-		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+	} else {
+		if (end_io_wq->metadata)
+			btrfs_queue_worker(&fs_info->endio_meta_workers,
+					   &end_io_wq->work);
+		else
+			btrfs_queue_worker(&fs_info->endio_workers,
+					   &end_io_wq->work);
+	}
 }
 
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -1208,6 +1215,9 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (!device->bdev)
+			continue;
+
 		bdi = blk_get_backing_dev_info(device->bdev);
 		if (bdi->unplug_io_fn) {
 			bdi->unplug_io_fn(bdi, page);
@@ -1344,7 +1354,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	 * blocksize <= pagesize, it is basically a noop
 	 */
 	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
-		btrfs_queue_worker(&fs_info->endio_workers,
+		btrfs_queue_worker(&fs_info->endio_meta_workers,
 				   &end_io_wq->work);
 		return;
 	}
@@ -1454,6 +1464,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
+	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
+						 GFP_NOFS);
 	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
@@ -1470,7 +1482,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct btrfs_super_block *disk_super;
 
 	if (!extent_root || !tree_root || !fs_info ||
-	    !chunk_root || !dev_root) {
+	    !chunk_root || !dev_root || !csum_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
@@ -1487,6 +1499,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
+	fs_info->csum_root = csum_root;
 	fs_info->chunk_root = chunk_root;
 	fs_info->dev_root = dev_root;
 	fs_info->fs_devices = fs_devices;
@@ -1652,6 +1665,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
 			   fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
+			   fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size);
 
@@ -1667,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_meta_workers,
+			    fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_write_workers,
 			    fs_info->thread_pool_size);
 
@@ -1751,6 +1768,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret)
 		goto fail_extent_root;
 
+	ret = find_and_setup_root(tree_root, fs_info,
+				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
+	if (ret)
+		goto fail_extent_root;
+
+	csum_root->track_dirty = 1;
+
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = generation + 1;
@@ -1761,7 +1785,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (!fs_info->cleaner_kthread)
-		goto fail_extent_root;
+		goto fail_csum_root;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -1825,6 +1849,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
+fail_csum_root:
+	free_extent_buffer(csum_root->node);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
@@ -1838,6 +1864,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
@@ -1853,6 +1880,7 @@ fail:
 	kfree(fs_info);
 	kfree(chunk_root);
 	kfree(dev_root);
+	kfree(csum_root);
 	return ERR_PTR(err);
 }
 
@@ -2131,6 +2159,9 @@ int close_ctree(struct btrfs_root *root)
 	if (root->fs_info->dev_root->node);
 		free_extent_buffer(root->fs_info->dev_root->node);
 
+	if (root->fs_info->csum_root->node)
+		free_extent_buffer(root->fs_info->csum_root->node);
+
 	btrfs_free_block_groups(root->fs_info);
 
 	del_fs_roots(fs_info);
@@ -2141,6 +2172,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
@@ -2163,6 +2195,7 @@ int close_ctree(struct btrfs_root *root)
 	kfree(fs_info->tree_root);
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c3dfe2a0ec8..7449ecf32c5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1732,6 +1732,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	int whole_page;
 	int ret;
 
+	if (err)
+		uptodate = 0;
+
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1761,6 +1764,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
+				if (err)
+					uptodate = 0;
 				continue;
 			}
 		}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 234ed441736..a3ad2ce0011 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -74,8 +74,7 @@ out:
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
-					  u64 objectid, u64 offset,
-					  int cow)
+					  u64 bytenr, int cow)
 {
 	int ret;
 	struct btrfs_key file_key;
@@ -87,9 +86,9 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		btrfs_super_csum_size(&root->fs_info->super_copy);
 	int csums_in_item;
 
-	file_key.objectid = objectid;
-	file_key.offset = offset;
-	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = bytenr;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 	ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
 	if (ret < 0)
 		goto fail;
@@ -100,11 +99,10 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 			goto fail;
 		path->slots[0]--;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
-		    found_key.objectid != objectid) {
+		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
 			goto fail;
-		}
-		csum_offset = (offset - found_key.offset) >>
+
+		csum_offset = (bytenr - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
 		csums_in_item /= csum_size;
@@ -143,7 +141,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio)
+			  struct bio *bio, u32 *dst)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
@@ -151,6 +149,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	u64 offset;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
+	u64 disk_bytenr;
 	u32 diff;
 	u16 csum_size =
 		btrfs_super_csum_size(&root->fs_info->super_copy);
@@ -165,21 +164,22 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
+	disk_bytenr = (u64)bio->bi_sector << 9;
 	while(bio_index < bio->bi_vcnt) {
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		ret = btrfs_find_ordered_sum(inode, offset, &sum);
+		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
 
-		if (!item || offset < item_start_offset ||
-		    offset >= item_last_offset) {
+		if (!item || disk_bytenr < item_start_offset ||
+		    disk_bytenr >= item_last_offset) {
 			struct btrfs_key found_key;
 			u32 item_size;
 
 			if (item)
 				btrfs_release_path(root, path);
-			item = btrfs_lookup_csum(NULL, root, path,
-						 inode->i_ino, offset, 0);
+			item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+						 path, disk_bytenr, 0);
 			if (IS_ERR(item)) {
 				ret = PTR_ERR(item);
 				if (ret == -ENOENT || ret == -EFBIG)
@@ -208,7 +208,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 		 * this byte range must be able to fit inside
 		 * a single leaf so it will also fit inside a u32
 		 */
-		diff = offset - item_start_offset;
+		diff = disk_bytenr - item_start_offset;
 		diff = diff / root->sectorsize;
 		diff = diff * csum_size;
 
@@ -216,7 +216,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 				   ((unsigned long)item) + diff,
 				   csum_size);
 found:
-		set_state_private(io_tree, offset, sum);
+		if (dst)
+			*dst++ = sum;
+		else
+			set_state_private(io_tree, offset, sum);
+		disk_bytenr += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -224,75 +228,8 @@ found:
 	return 0;
 }
 
-int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
-			  u64 start, unsigned long len)
-{
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	struct btrfs_ordered_extent *ordered;
-	char *data;
-	struct page *page;
-	unsigned long total_bytes = 0;
-	unsigned long this_sum_bytes = 0;
-
-	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
-	if (!sums)
-		return -ENOMEM;
-
-	sector_sum = sums->sums;
-	sums->file_offset = start;
-	sums->len = len;
-	INIT_LIST_HEAD(&sums->list);
-	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
-	BUG_ON(!ordered);
-
-	while(len > 0) {
-		if (start >= ordered->file_offset + ordered->len ||
-		    start < ordered->file_offset) {
-			sums->len = this_sum_bytes;
-			this_sum_bytes = 0;
-			btrfs_add_ordered_sum(inode, ordered, sums);
-			btrfs_put_ordered_extent(ordered);
-
-			sums = kzalloc(btrfs_ordered_sum_size(root, len),
-				       GFP_NOFS);
-			BUG_ON(!sums);
-			sector_sum = sums->sums;
-			sums->len = len;
-			sums->file_offset = start;
-			ordered = btrfs_lookup_ordered_extent(inode,
-						      sums->file_offset);
-			BUG_ON(!ordered);
-		}
-
-		page = find_get_page(inode->i_mapping,
-				     start >> PAGE_CACHE_SHIFT);
-
-		data = kmap_atomic(page, KM_USER0);
-		sector_sum->sum = ~(u32)0;
-		sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
-						  PAGE_CACHE_SIZE);
-		kunmap_atomic(data, KM_USER0);
-		btrfs_csum_final(sector_sum->sum,
-				 (char *)&sector_sum->sum);
-		sector_sum->offset = page_offset(page);
-		page_cache_release(page);
-
-		sector_sum++;
-		total_bytes += PAGE_CACHE_SIZE;
-		this_sum_bytes += PAGE_CACHE_SIZE;
-		start += PAGE_CACHE_SIZE;
-
-		WARN_ON(len < PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-	}
-	btrfs_add_ordered_sum(inode, ordered, sums);
-	btrfs_put_ordered_extent(ordered);
-	return 0;
-}
-
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
-		       struct bio *bio)
+		       struct bio *bio, u64 file_start, int contig)
 {
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
@@ -303,6 +240,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	unsigned long total_bytes = 0;
 	unsigned long this_sum_bytes = 0;
 	u64 offset;
+	u64 disk_bytenr;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
@@ -310,16 +248,25 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		return -ENOMEM;
 
 	sector_sum = sums->sums;
-	sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+	disk_bytenr = (u64)bio->bi_sector << 9;
 	sums->len = bio->bi_size;
 	INIT_LIST_HEAD(&sums->list);
-	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+
+	if (contig)
+		offset = file_start;
+	else
+		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	BUG_ON(!ordered);
+	sums->bytenr = ordered->start;
 
 	while(bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		if (offset >= ordered->file_offset + ordered->len ||
-		    offset < ordered->file_offset) {
+		if (!contig)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+		if (!contig && (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset)) {
 			unsigned long bytes_left;
 			sums->len = this_sum_bytes;
 			this_sum_bytes = 0;
@@ -333,10 +280,9 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			BUG_ON(!sums);
 			sector_sum = sums->sums;
 			sums->len = bytes_left;
-			sums->file_offset = offset;
-			ordered = btrfs_lookup_ordered_extent(inode,
-						      sums->file_offset);
+			ordered = btrfs_lookup_ordered_extent(inode, offset);
 			BUG_ON(!ordered);
+			sums->bytenr = ordered->start;
 		}
 
 		data = kmap_atomic(bvec->bv_page, KM_USER0);
@@ -348,13 +294,14 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		kunmap_atomic(data, KM_USER0);
 		btrfs_csum_final(sector_sum->sum,
 				 (char *)&sector_sum->sum);
-		sector_sum->offset = page_offset(bvec->bv_page) +
-			bvec->bv_offset;
+		sector_sum->bytenr = disk_bytenr;
 
 		sector_sum++;
 		bio_index++;
 		total_bytes += bvec->bv_len;
 		this_sum_bytes += bvec->bv_len;
+		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bvec++;
 	}
 	this_sum_bytes = 0;
@@ -364,11 +311,10 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 }
 
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct inode *inode,
+			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums)
 {
-	u64 objectid = inode->i_ino;
-	u64 offset;
+	u64 bytenr;
 	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
@@ -396,13 +342,12 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
-	offset = sector_sum->offset;
-	file_key.objectid = objectid;
-	file_key.offset = offset;
-	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = sector_sum->bytenr;
+	bytenr = sector_sum->bytenr;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
-	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
+	item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
 	if (!IS_ERR(item)) {
 		leaf = path->nodes[0];
 		ret = 0;
@@ -432,8 +377,8 @@ again:
 			slot = 0;
 		}
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
-		if (found_key.objectid != objectid ||
-		    found_key.type != BTRFS_CSUM_ITEM_KEY) {
+		if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    found_key.type != BTRFS_EXTENT_CSUM_KEY) {
 			found_next = 1;
 			goto insert;
 		}
@@ -460,10 +405,10 @@ again:
 	path->slots[0]--;
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	csum_offset = (offset - found_key.offset) >>
+	csum_offset = (bytenr - found_key.offset) >>
 			root->fs_info->sb->s_blocksize_bits;
-	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
-	    found_key.objectid != objectid ||
+	if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+	    found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
 	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
 		goto insert;
 	}
@@ -482,8 +427,18 @@ insert:
 	btrfs_release_path(root, path);
 	csum_offset = 0;
 	if (found_next) {
-		u64 tmp = min((u64)i_size_read(inode), next_offset);
-		tmp -= offset & ~((u64)root->sectorsize -1);
+		u64 tmp = total_bytes + root->sectorsize;
+		u64 next_sector = sector_sum->bytenr;
+		struct btrfs_sector_sum *next = sector_sum + 1;
+
+		while(tmp < sums->len) {
+			if (next_sector + root->sectorsize != next->bytenr)
+				break;
+			tmp += root->sectorsize;
+			next_sector = next->bytenr;
+			next++;
+		}
+		tmp = min(tmp, next_offset - file_key.offset);
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
 		tmp = max((u64)1, tmp);
 		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
@@ -510,7 +465,6 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	cond_resched();
 next_sector:
 
@@ -541,9 +495,9 @@ next_sector:
 	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  csum_size);
-		if (item < item_end && offset + PAGE_CACHE_SIZE ==
-		    sector_sum->offset) {
-			    offset = sector_sum->offset;
+		if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
+		    sector_sum->bytenr) {
+			bytenr = sector_sum->bytenr;
 			goto next_sector;
 		}
 	}
@@ -562,7 +516,6 @@ out:
 	return ret;
 
 fail_unlock:
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	goto out;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09efc9473a3..c03d847b8c4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1221,7 +1221,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	ret = btrfs_csum_one_bio(root, inode, bio);
+	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
 	BUG_ON(ret);
 	return 0;
 }
@@ -1259,12 +1259,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		btrfs_test_flag(inode, NODATASUM);
 
 	if (!(rw & (1 << BIO_RW))) {
-
-		if (bio_flags & EXTENT_BIO_COMPRESSED)
+		if (bio_flags & EXTENT_BIO_COMPRESSED) {
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
-		else if (!skip_sum)
-			btrfs_lookup_bio_sums(root, inode, bio);
+		} else if (!skip_sum)
+			btrfs_lookup_bio_sums(root, inode, bio, NULL);
 		goto mapit;
 	} else if (!skip_sum) {
 		/* we're doing a write, do the async checksumming */
@@ -1292,8 +1291,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	btrfs_set_trans_block_group(trans, inode);
 	list_for_each(cur, list) {
 		sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
-				       inode, sum);
+		btrfs_csum_file_blocks(trans,
+		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
 	}
 	return 0;
 }
@@ -1545,6 +1544,7 @@ struct io_failure_record {
 	u64 start;
 	u64 len;
 	u64 logical;
+	unsigned long bio_flags;
 	int last_mirror;
 };
 
@@ -1563,7 +1563,6 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	int ret;
 	int rw;
 	u64 logical;
-	unsigned long bio_flags = 0;
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
@@ -1573,6 +1572,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		failrec->start = start;
 		failrec->len = end - start + 1;
 		failrec->last_mirror = 0;
+		failrec->bio_flags = 0;
 
 		spin_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -1588,8 +1588,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		}
 		logical = start - em->start;
 		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
-			bio_flags = EXTENT_BIO_COMPRESSED;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+		}
 		failrec->logical = logical;
 		free_extent_map(em);
 		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -1626,6 +1628,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = failed_bio->bi_bdev;
 	bio->bi_size = 0;
+
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
 	if (failed_bio->bi_rw & (1 << BIO_RW))
 		rw = WRITE;
@@ -1634,7 +1637,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
 						      failrec->last_mirror,
-						      bio_flags);
+						      failrec->bio_flags);
 	return 0;
 }
 
@@ -1688,9 +1691,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	u32 csum = ~(u32)0;
 	unsigned long flags;
 
+	if (PageChecked(page)) {
+		ClearPageChecked(page);
+		goto good;
+	}
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
+
 	if (state && state->start == start) {
 		private = state->private;
 		ret = 0;
@@ -1709,7 +1717,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
-
+good:
 	/* if the io failure tree for this inode is non-empty,
 	 * check to see if we've recovered from a failed IO
 	 */
@@ -2243,6 +2251,7 @@ fail:
 	return err;
 }
 
+#if 0
 /*
  * when truncating bytes in a file, it is possible to avoid reading
  * the leaves that contain only checksum items.  This can be the
@@ -2410,6 +2419,8 @@ out:
 	return ret;
 }
 
+#endif
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -2459,9 +2470,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	btrfs_init_path(path);
 
-	ret = drop_csum_leaves(trans, root, path, inode, new_size);
-	BUG_ON(ret);
-
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {
@@ -2509,16 +2517,11 @@ search_again:
 			}
 			item_end--;
 		}
-		if (found_type == BTRFS_CSUM_ITEM_KEY) {
-			ret = btrfs_csum_truncate(trans, root, path,
-						  new_size);
-			BUG_ON(ret);
-		}
 		if (item_end < new_size) {
 			if (found_type == BTRFS_DIR_ITEM_KEY) {
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
-				found_type = BTRFS_CSUM_ITEM_KEY;
+				found_type = BTRFS_EXTENT_DATA_KEY;
 			} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
 				found_type = BTRFS_XATTR_ITEM_KEY;
 			} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b4da53d55c8..6228b69c2b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -714,8 +714,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
 	u64 hint_byte;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+
 	/*
 	 * TODO:
 	 * - split compressed inline extents.  annoying: we need to
@@ -833,7 +832,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		slot = path->slots[0];
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
 		    key.objectid != src->i_ino)
 			break;
 
@@ -958,56 +957,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_mark_buffer_dirty(leaf);
 		}
 
-		if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
-			u32 size;
-			struct btrfs_key new_key;
-			u64 coverslen;
-			int coff, clen;
-
-			size = btrfs_item_size_nr(leaf, slot);
-			coverslen = (size / csum_size) <<
-				root->fs_info->sb->s_blocksize_bits;
-			printk("csums for %llu~%llu\n",
-			       key.offset, coverslen);
-			if (key.offset + coverslen < off ||
-			    key.offset >= off+len)
-				goto next;
-
-			read_extent_buffer(leaf, buf,
-					   btrfs_item_ptr_offset(leaf, slot),
-					   size);
-			btrfs_release_path(root, path);
-
-			coff = 0;
-			if (off > key.offset)
-				coff = ((off - key.offset) >>
-					root->fs_info->sb->s_blocksize_bits) *
-					csum_size;
-			clen = size - coff;
-			if (key.offset + coverslen > off+len)
-				clen -= ((key.offset+coverslen-off-len) >>
-					 root->fs_info->sb->s_blocksize_bits) *
-					csum_size;
-			printk(" will dup %d~%d of %d\n",
-			       coff, clen, size);
-
-			memcpy(&new_key, &key, sizeof(new_key));
-			new_key.objectid = inode->i_ino;
-			new_key.offset = key.offset + destoff - off;
-
-			ret = btrfs_insert_empty_item(trans, root, path,
-						      &new_key, clen);
-			if (ret)
-				goto out;
-
-			leaf = path->nodes[0];
-			slot = path->slots[0];
-			write_extent_buffer(leaf, buf + coff,
-					    btrfs_item_ptr_offset(leaf, slot),
-					    clen);
-			btrfs_mark_buffer_dirty(leaf);
-		}
-
 	next:
 		btrfs_release_path(root, path);
 		key.offset++;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 027ad6b3839..d9e232227da 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -610,7 +610,8 @@ out:
  * try to find a checksum.  This is used because we allow pages to
  * be reclaimed before their checksum is actually put into the btree
  */
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+			   u32 *sum)
 {
 	struct btrfs_ordered_sum *ordered_sum;
 	struct btrfs_sector_sum *sector_sums;
@@ -629,11 +630,11 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 	mutex_lock(&tree->mutex);
 	list_for_each_prev(cur, &ordered->list) {
 		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		if (offset >= ordered_sum->file_offset) {
+		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
 			sector_sums = ordered_sum->sums;
 			for (i = 0; i < num_sectors; i++) {
-				if (sector_sums[i].offset == offset) {
+				if (sector_sums[i].bytenr == disk_bytenr) {
 					*sum = sector_sums[i].sum;
 					ret = 0;
 					goto out;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 260bf95dfe0..ab66d5e8d6d 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -33,15 +33,17 @@ struct btrfs_ordered_inode_tree {
  * the ordered extent are on disk
  */
 struct btrfs_sector_sum {
-	u64 offset;
+	/* bytenr on disk */
+	u64 bytenr;
 	u32 sum;
 };
 
 struct btrfs_ordered_sum {
-	u64 file_offset;
+	/* bytenr is the start of this extent on disk */
+	u64 bytenr;
+
 	/*
 	 * this is the length in bytes covered by the sums array below.
-	 * But, the sums array may not be contiguous in the file.
 	 */
 	unsigned long len;
 	struct list_head list;
@@ -147,7 +149,7 @@ struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
 int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c766649ad45..08469ec0585 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -934,24 +934,17 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	unsigned long file_bytes;
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
-	struct inode *inode;
 	unsigned long ptr;
 
 	file_bytes = (item_size / csum_size) * root->sectorsize;
-	inode = read_one_inode(root, key->objectid);
-	if (!inode) {
-		return -EIO;
-	}
-
 	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
 	if (!sums) {
-		iput(inode);
 		return -ENOMEM;
 	}
 
 	INIT_LIST_HEAD(&sums->list);
 	sums->len = file_bytes;
-	sums->file_offset = key->offset;
+	sums->bytenr = key->offset;
 
 	/*
 	 * copy all the sums into the ordered sum struct
@@ -960,7 +953,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	cur_offset = key->offset;
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	while(item_size > 0) {
-		sector_sum->offset = cur_offset;
+		sector_sum->bytenr = cur_offset;
 		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
 		item_size -= csum_size;
@@ -969,11 +962,9 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	}
 
 	/* let btrfs_csum_file_blocks add them into the file */
-	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
+	ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
 	BUG_ON(ret);
 	kfree(sums);
-	iput(inode);
-
 	return 0;
 }
 /*
@@ -1670,7 +1661,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
 			BUG_ON(ret);
-		} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
+		} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
 			ret = replay_one_csum(wc->trans, root, path,
 					      eb, i, &key);
 			BUG_ON(ret);
@@ -2466,6 +2457,85 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
+				      struct list_head *list,
+				      struct btrfs_root *root,
+				      u64 disk_bytenr, u64 len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_csum_item *item = NULL;
+	u64 end = disk_bytenr + len;
+	u64 item_start_offset = 0;
+	u64 item_last_offset = 0;
+	u32 diff;
+	u32 sum;
+	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+
+	sector_sum = sums->sums;
+	sums->bytenr = disk_bytenr;
+	sums->len = len;
+	list_add_tail(&sums->list, list);
+
+	path = btrfs_alloc_path();
+	while(disk_bytenr < end) {
+		if (!item || disk_bytenr < item_start_offset ||
+		    disk_bytenr >= item_last_offset) {
+			struct btrfs_key found_key;
+			u32 item_size;
+
+			if (item)
+				btrfs_release_path(root, path);
+			item = btrfs_lookup_csum(NULL, root, path,
+						 disk_bytenr, 0);
+			if (IS_ERR(item)) {
+				ret = PTR_ERR(item);
+				if (ret == -ENOENT || ret == -EFBIG)
+					ret = 0;
+				sum = 0;
+				printk("log no csum found for byte %llu\n",
+				       (unsigned long long)disk_bytenr);
+				item = NULL;
+				btrfs_release_path(root, path);
+				goto found;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+
+			item_start_offset = found_key.offset;
+			item_size = btrfs_item_size_nr(path->nodes[0],
+						       path->slots[0]);
+			item_last_offset = item_start_offset +
+				(item_size / csum_size) *
+				root->sectorsize;
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_csum_item);
+		}
+		/*
+		 * this byte range must be able to fit inside
+		 * a single leaf so it will also fit inside a u32
+		 */
+		diff = disk_bytenr - item_start_offset;
+		diff = diff / root->sectorsize;
+		diff = diff * csum_size;
+
+		read_extent_buffer(path->nodes[0], &sum,
+				   ((unsigned long)item) + diff,
+				   csum_size);
+found:
+		sector_sum->bytenr = disk_bytenr;
+		sector_sum->sum = sum;
+		disk_bytenr += root->sectorsize;
+		sector_sum++;
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *log,
 			       struct btrfs_path *dst_path,
@@ -2481,6 +2551,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	u32 *ins_sizes;
 	char *ins_data;
 	int i;
+	struct list_head ordered_sums;
+
+	INIT_LIST_HEAD(&ordered_sums);
 
 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
 			   nr * sizeof(u32), GFP_NOFS);
@@ -2535,6 +2608,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 								   extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(src,
 								      extent);
+				u64 cs = btrfs_file_extent_offset(src, extent);
+				u64 cl = btrfs_file_extent_num_bytes(src,
+								     extent);;
 				/* ds == 0 is a hole */
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, log,
@@ -2544,6 +2620,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						   trans->transid,
 						   ins_keys[i].objectid);
 					BUG_ON(ret);
+					ret = copy_extent_csums(trans,
+						&ordered_sums,
+						log->fs_info->csum_root,
+						ds + cs, cl);
+					BUG_ON(ret);
 				}
 			}
 		}
@@ -2553,6 +2634,20 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
 	btrfs_release_path(log, dst_path);
 	kfree(ins_data);
+
+	/*
+	 * we have to do this after the loop above to avoid changing the
+	 * log tree while trying to change the log tree.
+	 */
+	while(!list_empty(&ordered_sums)) {
+		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+						   struct btrfs_ordered_sum,
+						   list);
+		ret = btrfs_csum_file_blocks(trans, log, sums);
+		BUG_ON(ret);
+		list_del(&sums->list);
+		kfree(sums);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6c523b3360f..2049d179ccd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2771,6 +2771,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
+	INIT_LIST_HEAD(&device->dev_alloc_list);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
 	return device;
 }
-- 
cgit v1.2.3-70-g09d2


From a512bbf855ff0af474257475f2e6da7acd854f52 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 8 Dec 2008 16:46:26 -0500
Subject: Btrfs: superblock duplication

This patch implements superblock duplication. Superblocks
are stored at offsets 64K, 64M and 256G on every device.
The space used by superblocks is reserved by the allocator,
which uses a reverse mapping function to find the logical
addresses that correspond to superblocks. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c          | 208 ++++++++++++++++++++++++++++++--------------
 fs/btrfs/disk-io.h          |  17 +++-
 fs/btrfs/extent-tree.c      |  54 ++++++------
 fs/btrfs/free-space-cache.c |   1 -
 fs/btrfs/transaction.c      |   2 +-
 fs/btrfs/tree-log.c         |   3 +-
 fs/btrfs/volumes.c          | 107 ++++++++++++++++++-----
 fs/btrfs/volumes.h          |   6 +-
 8 files changed, 279 insertions(+), 119 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 61dc3b2c834..c72f4f3b912 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1595,8 +1595,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 
-	bh = __bread(fs_devices->latest_bdev,
-		     BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh)
 		goto fail_iput;
 
@@ -1710,7 +1709,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	mutex_lock(&fs_info->chunk_mutex);
-	ret = btrfs_read_sys_array(tree_root);
+	ret = btrfs_read_sys_array(tree_root, btrfs_super_bytenr(disk_super));
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk("btrfs: failed to read the system array on %s\n",
@@ -1905,19 +1904,147 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
-static int write_all_supers(struct btrfs_root *root)
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+	struct buffer_head *bh;
+	struct buffer_head *latest = NULL;
+	struct btrfs_super_block *super;
+	int i;
+	u64 transid = 0;
+	u64 bytenr;
+
+	/* we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	for (i = 0; i < 1; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+			break;
+		bh = __bread(bdev, bytenr / 4096, 4096);
+		if (!bh)
+			continue;
+
+		super = (struct btrfs_super_block *)bh->b_data;
+		if (btrfs_super_bytenr(super) != bytenr ||
+		    strncmp((char *)(&super->magic), BTRFS_MAGIC,
+			    sizeof(super->magic))) {
+			brelse(bh);
+			continue;
+		}
+
+		if (!latest || btrfs_super_generation(super) > transid) {
+			brelse(latest);
+			latest = bh;
+			transid = btrfs_super_generation(super);
+		} else {
+			brelse(bh);
+		}
+	}
+	return latest;
+}
+
+static int write_dev_supers(struct btrfs_device *device,
+			    struct btrfs_super_block *sb,
+			    int do_barriers, int wait, int max_mirrors)
+{
+	struct buffer_head *bh;
+	int i;
+	int ret;
+	int errors = 0;
+	u32 crc;
+	u64 bytenr;
+	int last_barrier = 0;
+
+	if (max_mirrors == 0)
+		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+	/* make sure only the last submit_bh does a barrier */
+	if (do_barriers) {
+		for (i = 0; i < max_mirrors; i++) {
+			bytenr = btrfs_sb_offset(i);
+			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+			    device->total_bytes)
+				break;
+			last_barrier = i;
+		}
+	}
+
+	for (i = 0; i < max_mirrors; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+			break;
+
+		if (wait) {
+			bh = __find_get_block(device->bdev, bytenr / 4096,
+					      BTRFS_SUPER_INFO_SIZE);
+			BUG_ON(!bh);
+			brelse(bh);
+			wait_on_buffer(bh);
+			if (buffer_uptodate(bh)) {
+				brelse(bh);
+				continue;
+			}
+		} else {
+			btrfs_set_super_bytenr(sb, bytenr);
+
+			crc = ~(u32)0;
+			crc = btrfs_csum_data(NULL, (char *)sb +
+					      BTRFS_CSUM_SIZE, crc,
+					      BTRFS_SUPER_INFO_SIZE -
+					      BTRFS_CSUM_SIZE);
+			btrfs_csum_final(crc, sb->csum);
+
+			bh = __getblk(device->bdev, bytenr / 4096,
+				      BTRFS_SUPER_INFO_SIZE);
+			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+			set_buffer_uptodate(bh);
+			get_bh(bh);
+			lock_buffer(bh);
+			bh->b_end_io = btrfs_end_buffer_write_sync;
+		}
+
+		if (i == last_barrier && do_barriers && device->barriers) {
+			ret = submit_bh(WRITE_BARRIER, bh);
+			if (ret == -EOPNOTSUPP) {
+				printk("btrfs: disabling barriers on dev %s\n",
+				       device->name);
+				set_buffer_uptodate(bh);
+				device->barriers = 0;
+				get_bh(bh);
+				lock_buffer(bh);
+				ret = submit_bh(WRITE, bh);
+			}
+		} else {
+			ret = submit_bh(WRITE, bh);
+		}
+
+		if (!ret && wait) {
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				errors++;
+		} else if (ret) {
+			errors++;
+		}
+		if (wait)
+			brelse(bh);
+	}
+	return errors < i ? 0 : -1;
+}
+
+int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
-	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
 	int max_errors;
 	int total_errors = 0;
-	u32 crc;
 	u64 flags;
 
 	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
@@ -1944,40 +2071,11 @@ static int write_all_supers(struct btrfs_root *root)
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
 		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
-
-		crc = ~(u32)0;
-		crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
-				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-		btrfs_csum_final(crc, sb->csum);
-
-		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
-			      BTRFS_SUPER_INFO_SIZE);
-
-		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
-		dev->pending_io = bh;
-
-		get_bh(bh);
-		set_buffer_uptodate(bh);
-		lock_buffer(bh);
-		bh->b_end_io = btrfs_end_buffer_write_sync;
-
-		if (do_barriers && dev->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       dev->name);
-				set_buffer_uptodate(bh);
-				dev->barriers = 0;
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE, bh);
-			}
-		} else {
-			ret = submit_bh(WRITE, bh);
-		}
+		ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
 		if (ret)
 			total_errors++;
 	}
@@ -1985,8 +2083,8 @@ static int write_all_supers(struct btrfs_root *root)
 		printk("btrfs: %d errors while writing supers\n", total_errors);
 		BUG();
 	}
-	total_errors = 0;
 
+	total_errors = 0;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (!dev->bdev)
@@ -1994,29 +2092,9 @@ static int write_all_supers(struct btrfs_root *root)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		BUG_ON(!dev->pending_io);
-		bh = dev->pending_io;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(dev->pending_io)) {
-			if (do_barriers && dev->barriers) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       dev->name);
-				set_buffer_uptodate(bh);
-				get_bh(bh);
-				lock_buffer(bh);
-				dev->barriers = 0;
-				ret = submit_bh(WRITE, bh);
-				BUG_ON(ret);
-				wait_on_buffer(bh);
-				if (!buffer_uptodate(bh))
-					total_errors++;
-			} else {
-				total_errors++;
-			}
-
-		}
-		dev->pending_io = NULL;
-		brelse(bh);
+		ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+		if (ret)
+			total_errors++;
 	}
 	if (total_errors > max_errors) {
 		printk("btrfs: %d errors while writing supers\n", total_errors);
@@ -2025,12 +2103,12 @@ static int write_all_supers(struct btrfs_root *root)
 	return 0;
 }
 
-int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root)
+int write_ctree_super(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, int max_mirrors)
 {
 	int ret;
 
-	ret = write_all_supers(root);
+	ret = write_all_supers(root, max_mirrors);
 	return ret;
 }
 
@@ -2116,7 +2194,7 @@ int btrfs_commit_super(struct btrfs_root *root)
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
 
-	ret = write_ctree_super(NULL, root);
+	ret = write_ctree_super(NULL, root, 0);
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 717e94811e4..c0ff404c31b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,8 +19,20 @@
 #ifndef __DISKIO__
 #define __DISKIO__
 
-#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
 #define BTRFS_SUPER_INFO_SIZE 4096
+
+#define BTRFS_SUPER_MIRROR_MAX	 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+	u64 start = 16 * 1024;
+	if (mirror)
+		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+	return BTRFS_SUPER_INFO_OFFSET;
+}
+
 struct btrfs_device;
 struct btrfs_fs_devices;
 
@@ -37,7 +49,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			      char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root);
+		      struct btrfs_root *root, int max_mirrors);
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d1563852938..803647bc840 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -189,6 +189,29 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	return 0;
 }
 
+static int remove_sb_from_cache(struct btrfs_root *root,
+				struct btrfs_block_group_cache *cache)
+{
+	u64 bytenr;
+	u64 *logical;
+	int stripe_len;
+	int i, nr, ret;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+				       cache->key.objectid, bytenr, 0,
+				       &logical, &nr, &stripe_len);
+		BUG_ON(ret);
+		while (nr--) {
+			btrfs_remove_free_space(cache, logical[nr],
+						stripe_len);
+		}
+		kfree(logical);
+	}
+	return 0;
+}
+
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -197,9 +220,7 @@ static int cache_block_group(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int slot;
-	u64 last = 0;
-	u64 first_free;
-	int found = 0;
+	u64 last = block_group->key.objectid;
 
 	if (!block_group)
 		return 0;
@@ -220,23 +241,13 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	first_free = max_t(u64, block_group->key.objectid,
-			   BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
-	key.objectid = block_group->key.objectid;
+	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
-	if (ret < 0)
-		goto err;
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid + key.offset > first_free)
-			first_free = key.objectid + key.offset;
-	}
+
 	while(1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -258,11 +269,6 @@ static int cache_block_group(struct btrfs_root *root,
 			break;
 
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-			if (!found) {
-				last = first_free;
-				found = 1;
-			}
-
 			add_new_free_space(block_group, root->fs_info, last,
 					   key.objectid);
 
@@ -272,13 +278,11 @@ next:
 		path->slots[0]++;
 	}
 
-	if (!found)
-		last = first_free;
-
 	add_new_free_space(block_group, root->fs_info, last,
 			   block_group->key.objectid +
 			   block_group->key.offset);
 
+	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
 	ret = 0;
 err:
@@ -1974,10 +1978,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (alloc) {
 			old_val += num_bytes;
 			cache->space_info->bytes_used += num_bytes;
-			if (cache->ro) {
+			if (cache->ro)
 				cache->space_info->bytes_readonly -= num_bytes;
-				WARN_ON(1);
-			}
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 09462adfbe3..2e69b9c3043 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,6 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			ret = -EINVAL;
 			goto out;
 		}
-
 		unlink_free_space(block_group, info);
 
 		if (info->bytes == bytes) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c38f6a0e30b..47cd5fcad2c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1038,7 +1038,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
-	write_ctree_super(trans, root);
+	write_ctree_super(trans, root, 0);
 
 	/*
 	 * the super is written, we can safely allow the tree-loggers
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 08469ec0585..d3f9c2c663c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1996,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
 		       btrfs_header_level(log->fs_info->log_root_tree->node));
 
-	write_ctree_super(trans, log->fs_info->tree_root);
+	write_ctree_super(trans, log->fs_info->tree_root, 2);
 	log->fs_info->tree_log_transid++;
 	log->fs_info->tree_log_batch = 0;
 	atomic_set(&log->fs_info->tree_log_commit, 0);
@@ -2006,7 +2006,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 out:
 	mutex_unlock(&log->fs_info->tree_log_mutex);
 	return 0;
-
 }
 
 /* * free all the extents used by the tree log.  This should be called
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2049d179ccd..a79b3cc09e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -423,15 +423,11 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		}
 		set_blocksize(bdev, 4096);
 
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh)
 			goto error_close;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic)))
-			goto error_brelse;
-
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		if (devid != device->devid)
 			goto error_brelse;
@@ -529,17 +525,12 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	ret = set_blocksize(bdev, 4096);
 	if (ret)
 		goto error_close;
-	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	bh = btrfs_read_dev_super(bdev);
 	if (!bh) {
 		ret = -EIO;
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-	    sizeof(disk_super->magic))) {
-		ret = -EINVAL;
-		goto error_brelse;
-	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
@@ -553,7 +544,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
-error_brelse:
 	brelse(bh);
 error_close:
 	close_bdev_exclusive(bdev, flags);
@@ -1016,17 +1006,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 
 		set_blocksize(bdev, 4096);
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh) {
 			ret = -EIO;
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-			    sizeof(disk_super->magic))) {
-			ret = -ENOENT;
-			goto error_brelse;
-		}
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		dev_uuid = disk_super->dev_item.uuid;
 		device = btrfs_find_device(root, devid, dev_uuid,
@@ -2563,6 +2548,88 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 				 mirror_num, NULL);
 }
 
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 *buf;
+	u64 bytenr;
+	u64 length;
+	u64 stripe_nr;
+	int i, j, nr = 0;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_start, 1);
+	spin_unlock(&em_tree->lock);
+
+	BUG_ON(!em || em->start != chunk_start);
+	map = (struct map_lookup *)em->bdev;
+
+	length = em->len;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		do_div(length, map->num_stripes / map->sub_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+		do_div(length, map->num_stripes);
+
+	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+	BUG_ON(!buf);
+
+	for (i = 0; i < map->num_stripes; i++) {
+		if (devid && map->stripes[i].dev->devid != devid)
+			continue;
+		if (map->stripes[i].physical > physical ||
+		    map->stripes[i].physical + length <= physical)
+			continue;
+
+		stripe_nr = physical - map->stripes[i].physical;
+		do_div(stripe_nr, map->stripe_len);
+
+		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+			do_div(stripe_nr, map->sub_stripes);
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+		}
+		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		for (j = 0; j < nr; j++) {
+			if (buf[j] == bytenr)
+				break;
+		}
+		if (j == nr)
+			buf[nr++] = bytenr;
+	}
+
+	for (i = 0; i > nr; i++) {
+		struct btrfs_multi_bio *multi;
+		struct btrfs_bio_stripe *stripe;
+		int ret;
+
+		length = 1;
+		ret = btrfs_map_block(map_tree, WRITE, buf[i],
+				      &length, &multi, 0);
+		BUG_ON(ret);
+
+		stripe = multi->stripes;
+		for (j = 0; j < multi->num_stripes; j++) {
+			if (stripe->physical >= physical &&
+			    physical < stripe->physical + length)
+				break;
+		}
+		BUG_ON(j >= multi->num_stripes);
+		kfree(multi);
+	}
+
+	*logical = buf;
+	*naddrs = nr;
+	*stripe_len = map->stripe_len;
+
+	free_extent_map(em);
+	return 0;
+}
+
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 		      u64 logical, struct page *page)
 {
@@ -3003,7 +3070,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 	return read_one_dev(root, buf, dev_item);
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root)
+int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb;
@@ -3018,7 +3085,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u32 cur;
 	struct btrfs_key key;
 
-	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+	sb = btrfs_find_create_tree_block(root, sb_bytenr,
 					  BTRFS_SUPER_INFO_SIZE);
 	if (!sb)
 		return -ENOMEM;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index fcbdcb3ae13..bdebe83c319 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -28,7 +28,6 @@ struct btrfs_device {
 	struct list_head dev_alloc_list;
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
-	struct buffer_head *pending_io;
 	struct bio *pending_bios;
 	struct bio *pending_bio_tail;
 	int running_pending;
@@ -125,7 +124,10 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_multi_bio **multi_ret, int mirror_num);
-int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 type);
-- 
cgit v1.2.3-70-g09d2


From 580afd76e451deb6772d0507de580fb1df14da6c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 19:15:39 -0500
Subject: Btrfs: Fix compressed checksum fsync log copies

The fsync logging code makes sure to only copy the relevant checksum for each
extent based on the file extent pointers it finds.

But for compressed extents, it needs to copy the checksum for the
entire extent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c     | 3 ++-
 fs/btrfs/tree-log.c | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b5a6a2b6f66..71bfe3a6a44 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1228,7 +1228,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	root->fs_info->tree_log_batch++;
-	filemap_fdatawait(inode->i_mapping);
+	filemap_fdatawrite(inode->i_mapping);
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	root->fs_info->tree_log_batch++;
 
 	/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d3f9c2c663c..6ac1b7f72e2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2610,6 +2610,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				u64 cs = btrfs_file_extent_offset(src, extent);
 				u64 cl = btrfs_file_extent_num_bytes(src,
 								     extent);;
+				if (btrfs_file_extent_compression(src,
+								  extent)) {
+					cs = 0;
+					cl = dl;
+				}
 				/* ds == 0 is a hole */
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, log,
-- 
cgit v1.2.3-70-g09d2


From 87b29b208c6c38f3446d2de6ece946e2459052cf Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 17 Dec 2008 10:21:48 -0500
Subject: Btrfs: properly check free space for tree balancing

btrfs_insert_empty_items takes the space needed by the btrfs_item
structure into account when calculating the required free space.

So the tree balancing code shouldn't add sizeof(struct btrfs_item)
to the size when checking the free space. This patch removes these
superfluous additions.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.c     | 52 ++++++++++++++++++++++++++--------------------------
 fs/btrfs/file-item.c |  4 ++++
 fs/btrfs/tree-log.c  |  9 ++-------
 3 files changed, 32 insertions(+), 33 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f6f7a6af035..7fad2e3ad6f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1587,8 +1587,8 @@ cow_done:
 				btrfs_tree_lock(b);
 		} else {
 			p->slots[level] = slot;
-			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
-			    sizeof(struct btrfs_item) + ins_len) {
+			if (ins_len > 0 &&
+			    btrfs_leaf_free_space(root, b) < ins_len) {
 				int sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
 				BUG_ON(sret > 0);
@@ -2231,7 +2231,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item))
+	if (free_space < data_size)
 		goto out_unlock;
 
 	/* cow and double check */
@@ -2241,7 +2241,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		goto out_unlock;
 
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item))
+	if (free_space < data_size)
 		goto out_unlock;
 
 	left_nritems = btrfs_header_nritems(left);
@@ -2254,7 +2254,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		nr = 1;
 
 	if (path->slots[0] >= left_nritems)
-		push_space += data_size + sizeof(*item);
+		push_space += data_size;
 
 	i = left_nritems - 1;
 	while (i >= nr) {
@@ -2271,7 +2271,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 
 		if (path->slots[0] == i)
-			push_space += data_size + sizeof(*item);
+			push_space += data_size;
 
 		if (!left->map_token) {
 			map_extent_buffer(left, (unsigned long)item,
@@ -2427,7 +2427,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
 	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
+	if (free_space < data_size) {
 		ret = 1;
 		goto out;
 	}
@@ -2442,7 +2442,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
+	if (free_space < data_size) {
 		ret = 1;
 		goto out;
 	}
@@ -2473,7 +2473,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 
 		if (path->slots[0] == i)
-			push_space += data_size + sizeof(*item);
+			push_space += data_size;
 
 		this_item_size = btrfs_item_size(right, item);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
@@ -2510,7 +2510,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		     btrfs_item_offset_nr(right, push_items - 1),
 		     push_space);
 	old_left_nritems = btrfs_header_nritems(left);
-	BUG_ON(old_left_nritems < 0);
+	BUG_ON(old_left_nritems <= 0);
 
 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
@@ -2628,7 +2628,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int mid;
 	int slot;
 	struct extent_buffer *right;
-	int space_needed = data_size + sizeof(struct btrfs_item);
 	int data_copy_size;
 	int rt_data_off;
 	int i;
@@ -2638,9 +2637,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int num_doubles = 0;
 	struct btrfs_disk_key disk_key;
 
-	if (extend && data_size)
-		space_needed = data_size;
-
 	/* first try to make some room by pushing left and right */
 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
@@ -2655,7 +2651,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		l = path->nodes[0];
 
 		/* did the pushes work? */
-		if (btrfs_leaf_free_space(root, l) >= space_needed)
+		if (btrfs_leaf_free_space(root, l) >= data_size)
 			return 0;
 	}
 
@@ -2694,7 +2690,7 @@ again:
 			    BTRFS_UUID_SIZE);
 	if (mid <= slot) {
 		if (nritems == 1 ||
-		    leaf_space_used(l, mid, nritems - mid) + space_needed >
+		    leaf_space_used(l, mid, nritems - mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (slot >= nritems) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
@@ -2716,12 +2712,12 @@ again:
 			mid = slot;
 			if (mid != nritems &&
 			    leaf_space_used(l, mid, nritems - mid) +
-			    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
+			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
 				double_split = 1;
 			}
 		}
 	} else {
-		if (leaf_space_used(l, 0, mid + 1) + space_needed >
+		if (leaf_space_used(l, 0, mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (!extend && data_size && slot == 0) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
@@ -2750,7 +2746,7 @@ again:
 				mid = slot;
 				if (mid != nritems &&
 				    leaf_space_used(l, mid, nritems - mid) +
-				    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
 					double_split = 1;
 				}
 			}
@@ -2883,7 +2879,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 		return -EAGAIN;
 	}
 
-	ret = split_leaf(trans, root, &orig_key, path, 0, 0);
+	ret = split_leaf(trans, root, &orig_key, path,
+			 sizeof(struct btrfs_item), 1);
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
@@ -3169,14 +3166,17 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 	struct btrfs_key found_key;
 
-	found_key.objectid = 0;
-	nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));
-
-	for (i = 0; i < nr; i++)
+	for (i = 0; i < nr; i++) {
+		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
+		    BTRFS_LEAF_DATA_SIZE(root)) {
+			break;
+			nr = i;
+		}
 		total_data += data_size[i];
+		total_size += data_size[i] + sizeof(struct btrfs_item);
+	}
+	BUG_ON(nr == 0);
 
-	total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
-	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
 	if (ret == 0)
 		return -EEXIST;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7acadf3b742..cc6e0b6de94 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -300,6 +300,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 
 		size = btrfs_item_size_nr(leaf, path->slots[0]);
 		csum_end = key.offset + (size / csum_size) * root->sectorsize;
+		if (csum_end <= start) {
+			path->slots[0]++;
+			continue;
+		}
 
 		size = min(csum_end, end + 1) - start;
 		sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6ac1b7f72e2..33eee256ee8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -374,13 +374,8 @@ insert:
 		if (found_size > item_size) {
 			btrfs_truncate_item(trans, root, path, item_size, 1);
 		} else if (found_size < item_size) {
-			ret = btrfs_del_item(trans, root,
-					     path);
-			BUG_ON(ret);
-
-			btrfs_release_path(root, path);
-			ret = btrfs_insert_empty_item(trans,
-				  root, path, key, item_size);
+			ret = btrfs_extend_item(trans, root, path,
+						item_size - found_size);
 			BUG_ON(ret);
 		}
 	} else if (ret) {
-- 
cgit v1.2.3-70-g09d2


From ec051c0f929afe5c42c24bb07abf577c616c208c Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 5 Jan 2009 15:43:42 -0500
Subject: Btrfs: avoid orphan inode caused by log replay

drop_one_dir_item does not properly update the inode's link count. It can
be reproduced by executing the following commands:

#touch test
#sync
#rm -f test
#dd if=/dev/zero bs=4k count=1 of=test conv=fsync
#echo b > /proc/sysrq-trigger

This fixes it by adding a BTRFS_ORPHAN_ITEM_KEY for the inode.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/tree-log.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 33eee256ee8..b1c2921f5be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -50,6 +50,9 @@
 static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
 			     int inode_only);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid);
 
 /*
  * tree logging is a special write ahead log used to make sure that
@@ -638,8 +641,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 	inode = read_one_inode(root, location.objectid);
 	BUG_ON(!inode);
 
-	btrfs_inc_nlink(inode);
+	ret = link_to_fixup_dir(trans, root, path, location.objectid);
+	BUG_ON(ret);
 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+	BUG_ON(ret);
 	kfree(name);
 
 	iput(inode);
-- 
cgit v1.2.3-70-g09d2


From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 Jan 2009 21:25:51 -0500
Subject: Btrfs: Fix checkpatch.pl warnings

There were many; most are fixed now.  struct-funcs.c generates some warnings,
but these are bogus.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c              |   5 +-
 fs/btrfs/async-thread.c     |   6 +-
 fs/btrfs/compat.h           |   4 +-
 fs/btrfs/compression.c      |  12 +--
 fs/btrfs/ctree.c            | 217 ++++++++++++++++--------------------------
 fs/btrfs/ctree.h            |  30 +++---
 fs/btrfs/dir-item.c         |   2 +-
 fs/btrfs/disk-io.c          | 187 +++++++++++++------------------------
 fs/btrfs/export.c           |   8 +-
 fs/btrfs/extent-tree.c      | 223 +++++++++++++++++++++++---------------------
 fs/btrfs/extent_io.c        | 213 ++++++++++++++++++++----------------------
 fs/btrfs/extent_map.c       |  14 +--
 fs/btrfs/file-item.c        |  18 ++--
 fs/btrfs/file.c             |  49 +++++-----
 fs/btrfs/free-space-cache.c |  37 ++++----
 fs/btrfs/inode-map.c        |   1 -
 fs/btrfs/inode.c            | 173 +++++++++++++++++-----------------
 fs/btrfs/ioctl.c            |  37 ++++----
 fs/btrfs/locking.c          |   5 +-
 fs/btrfs/ordered-data.c     |  34 +++----
 fs/btrfs/print-tree.c       |  73 +++++++++------
 fs/btrfs/ref-cache.c        |  12 +--
 fs/btrfs/root-tree.c        |  17 ++--
 fs/btrfs/struct-funcs.c     |   4 +-
 fs/btrfs/super.c            |  25 +++--
 fs/btrfs/sysfs.c            |   6 +-
 fs/btrfs/transaction.c      |  45 ++++-----
 fs/btrfs/transaction.h      |   6 +-
 fs/btrfs/tree-defrag.c      |   9 +-
 fs/btrfs/tree-log.c         |  70 +++++++-------
 fs/btrfs/volumes.c          |  78 ++++++++--------
 fs/btrfs/xattr.c            |   3 +-
 fs/btrfs/zlib.c             |  45 ++++-----
 33 files changed, 770 insertions(+), 898 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 867eaf1f8ef..1d53b62dbba 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -161,8 +161,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	ret = __btrfs_setxattr(inode, name, value, size, 0);
 
 out:
-	if (value)
-		kfree(value);
+	kfree(value);
 
 	if (!ret)
 		btrfs_update_cached_acl(inode, p_acl, acl);
@@ -213,7 +212,7 @@ static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
 }
 
 static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
-				       const void *value, size_t size, int flags)
+			       const void *value, size_t size, int flags)
 {
 	return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4229450b759..8e2fec05dbe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -104,7 +104,7 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while(!list_empty(&workers->order_list)) {
+	while (!list_empty(&workers->order_list)) {
 		work = list_entry(workers->order_list.next,
 				  struct btrfs_work, order_list);
 
@@ -143,7 +143,7 @@ static int worker_loop(void *arg)
 	struct btrfs_work *work;
 	do {
 		spin_lock_irq(&worker->lock);
-		while(!list_empty(&worker->pending)) {
+		while (!list_empty(&worker->pending)) {
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
@@ -188,7 +188,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 
 	list_splice_init(&workers->idle_list, &workers->worker_list);
-	while(!list_empty(&workers->worker_list)) {
+	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
 				    worker_list);
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index 75e4426d6fb..594d60bdd3c 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -4,7 +4,7 @@
 #define btrfs_drop_nlink(inode) drop_nlink(inode)
 #define btrfs_inc_nlink(inode)	inc_nlink(inode)
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 27)
 static inline struct dentry *d_obtain_alias(struct inode *inode)
 {
 	struct dentry *d;
@@ -21,7 +21,7 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
 # define  __pagevec_lru_add_file __pagevec_lru_add
 # define open_bdev_exclusive open_bdev_excl
 # define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2436163d543..ee848d8585d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -137,7 +137,8 @@ static int check_compressed_csum(struct inode *inode,
 		kunmap_atomic(kaddr, KM_USER0);
 
 		if (csum != *cb_sum) {
-			printk("btrfs csum failed ino %lu extent %llu csum %u "
+			printk(KERN_INFO "btrfs csum failed ino %lu "
+			       "extent %llu csum %u "
 			       "wanted %u mirror %d\n", inode->i_ino,
 			       (unsigned long long)disk_start,
 			       csum, *cb_sum, cb->mirror_num);
@@ -217,7 +218,7 @@ csum_failed:
 		 * we have verified the checksum already, set page
 		 * checked so the end_io handlers know about it
 		 */
-		while(bio_index < cb->orig_bio->bi_vcnt) {
+		while (bio_index < cb->orig_bio->bi_vcnt) {
 			SetPageChecked(bvec->bv_page);
 			bvec++;
 			bio_index++;
@@ -246,7 +247,7 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
 	int i;
 	int ret;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
@@ -463,7 +464,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&pvec, 0);
-	while(last_offset < compressed_end) {
+	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
 		if (page_index > end_index)
@@ -697,9 +698,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	if (!btrfs_test_flag(inode, NODATASUM)) {
+	if (!btrfs_test_flag(inode, NODATASUM))
 		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
-	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
 	BUG_ON(ret);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7fad2e3ad6f..9e46c077681 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -67,7 +67,7 @@ void btrfs_free_path(struct btrfs_path *p)
  *
  * It is safe to call this on paths that no locks or extent buffers held.
  */
-void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
 
@@ -112,7 +112,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
 
-	while(1) {
+	while (1) {
 		eb = btrfs_root_node(root);
 		btrfs_tree_lock(eb);
 
@@ -202,22 +202,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 }
 
 /*
- * does the dirty work in cow of a single block.  The parent block
- * (if supplied) is updated to point to the new cow copy.  The new
- * buffer is marked dirty and returned locked.  If you modify the block
- * it needs to be marked dirty again.
+ * does the dirty work in cow of a single block.  The parent block (if
+ * supplied) is updated to point to the new cow copy.  The new buffer is marked
+ * dirty and returned locked.  If you modify the block it needs to be marked
+ * dirty again.
  *
  * search_start -- an allocation hint for the new block
  *
- * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
- * the allocator should try to find free next to the block it returns.  This is
- * just a hint and may be ignored by the allocator.
+ * empty_size -- a hint that you plan on doing more cow.  This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
  *
  * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
- * is used to finish the allocation.
+ * this uses that block instead of allocating a new one.
+ * btrfs_alloc_reserved_extent is used to finish the allocation.
  */
-static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
@@ -366,7 +366,7 @@ static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
  * This version of it has extra checks so that a block isn't cow'd more than
  * once per transaction, as long as it hasn't been written yet
  */
-int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret, u64 prealloc_dest)
@@ -375,13 +375,16 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 	int ret;
 
 	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		printk(KERN_CRIT "trans %llu running %llu\n",
+		       (unsigned long long)trans->transid,
+		       (unsigned long long)
 		       root->fs_info->running_transaction->transid);
 		WARN_ON(1);
 	}
 	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->generation);
+		printk(KERN_CRIT "trans %llu running %llu\n",
+		       (unsigned long long)trans->transid,
+		       (unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
 
@@ -489,16 +492,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (cache_only && parent_level != 1)
 		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->running_transaction->transid);
+	if (trans->transaction != root->fs_info->running_transaction)
 		WARN_ON(1);
-	}
-	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->generation);
+	if (trans->transid != root->fs_info->generation)
 		WARN_ON(1);
-	}
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -681,51 +678,18 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
 		       btrfs_header_bytenr(leaf));
 	}
-#if 0
-	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
-		btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
-		btrfs_item_key(leaf, &leaf_key, i);
-		if (comp_keys(&leaf_key, &cpukey) >= 0) {
-			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad key\n", i);
-			BUG_ON(1);
-		}
-		if (btrfs_item_offset_nr(leaf, i) !=
-			btrfs_item_end_nr(leaf, i + 1)) {
-			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", i);
-			BUG_ON(1);
-		}
-		if (i == 0) {
-			if (btrfs_item_offset_nr(leaf, i) +
-			       btrfs_item_size_nr(leaf, i) !=
-			       BTRFS_LEAF_DATA_SIZE(root)) {
-				btrfs_print_leaf(root, leaf);
-				printk("slot %d first offset bad\n", i);
-				BUG_ON(1);
-			}
-		}
-	}
-	if (nritems > 0) {
-		if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
-				btrfs_print_leaf(root, leaf);
-				printk("slot %d bad size \n", nritems - 1);
-				BUG_ON(1);
-		}
-	}
-#endif
 	if (slot != 0 && slot < nritems - 1) {
 		btrfs_item_key(leaf, &leaf_key, slot);
 		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
 		if (comp_keys(&leaf_key, &cpukey) <= 0) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad key\n", slot);
+			printk(KERN_CRIT "slot %d offset bad key\n", slot);
 			BUG_ON(1);
 		}
 		if (btrfs_item_offset_nr(leaf, slot - 1) !=
 		       btrfs_item_end_nr(leaf, slot)) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", slot);
+			printk(KERN_CRIT "slot %d offset bad\n", slot);
 			BUG_ON(1);
 		}
 	}
@@ -736,7 +700,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		if (btrfs_item_offset_nr(leaf, slot) !=
 			btrfs_item_end_nr(leaf, slot + 1)) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", slot);
+			printk(KERN_CRIT "slot %d offset bad\n", slot);
 			BUG_ON(1);
 		}
 	}
@@ -745,30 +709,10 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	return 0;
 }
 
-static int noinline check_block(struct btrfs_root *root,
+static noinline int check_block(struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
-	u64 found_start;
 	return 0;
-	if (btrfs_header_level(path->nodes[level]) != level)
-	    printk("warning: bad level %Lu wanted %d found %d\n",
-		   path->nodes[level]->start, level,
-		   btrfs_header_level(path->nodes[level]));
-	found_start = btrfs_header_bytenr(path->nodes[level]);
-	if (found_start != path->nodes[level]->start) {
-	    printk("warning: bad bytentr %Lu found %Lu\n",
-		   path->nodes[level]->start, found_start);
-	}
-#if 0
-	struct extent_buffer *buf = path->nodes[level];
-
-	if (memcmp_extent_buffer(buf, root->fs_info->fsid,
-				 (unsigned long)btrfs_header_fsid(buf),
-				 BTRFS_FSID_SIZE)) {
-		printk("warning bad block %Lu\n", buf->start);
-		return 1;
-	}
-#endif
 	if (level == 0)
 		return check_leaf(root, path, level);
 	return check_node(root, path, level);
@@ -802,7 +746,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 	unsigned long map_len = 0;
 	int err;
 
-	while(low < high) {
+	while (low < high) {
 		mid = (low + high) / 2;
 		offset = p + mid * item_size;
 
@@ -1130,7 +1074,7 @@ enospc:
  * when they are completely full.  This is also done top down, so we
  * have to be pessimistic.
  */
-static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, int level)
 {
@@ -1296,7 +1240,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
-	while(1) {
+	while (1) {
 		if (direction < 0) {
 			if (nr == 0)
 				break;
@@ -1322,7 +1266,8 @@ static noinline void reada_for_search(struct btrfs_root *root,
 		nscan++;
 		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
 			break;
-		if(nread > (256 * 1024) || nscan > 128)
+
+		if (nread > (256 * 1024) || nscan > 128)
 			break;
 
 		if (search < lowest_read)
@@ -1333,17 +1278,17 @@ static noinline void reada_for_search(struct btrfs_root *root,
 }
 
 /*
- * when we walk down the tree, it is usually safe to unlock the higher layers in
- * the tree.  The exceptions are when our path goes through slot 0, because operations
- * on the tree might require changing key pointers higher up in the tree.
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree.  The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
  *
- * callers might also have set path->keep_locks, which tells this code to
- * keep the lock if the path points to the last slot in the block.  This is
- * part of walking through the tree, and selecting the next slot in the higher
- * block.
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block.  This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
  *
- * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
- * so if lowest_unlock is 1, level 0 won't be unlocked
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
+ * if lowest_unlock is 1, level 0 won't be unlocked
  */
 static noinline void unlock_up(struct btrfs_path *path, int level,
 			       int lowest_unlock)
@@ -1832,9 +1777,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	if (!empty && src_nritems <= 8)
 		return 1;
 
-	if (push_items <= 0) {
+	if (push_items <= 0)
 		return 1;
-	}
 
 	if (empty) {
 		push_items = min(src_nritems, push_items);
@@ -1854,7 +1798,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(dst_nritems),
 			   btrfs_node_key_ptr_offset(0),
-		           push_items * sizeof(struct btrfs_key_ptr));
+			   push_items * sizeof(struct btrfs_key_ptr));
 
 	if (push_items < src_nritems) {
 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
@@ -1899,19 +1843,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	src_nritems = btrfs_header_nritems(src);
 	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
-	if (push_items <= 0) {
+	if (push_items <= 0)
 		return 1;
-	}
 
-	if (src_nritems < 4) {
+	if (src_nritems < 4)
 		return 1;
-	}
 
 	max_push = src_nritems / 2 + 1;
 	/* don't try to empty the node */
-	if (max_push >= src_nritems) {
+	if (max_push >= src_nritems)
 		return 1;
-	}
 
 	if (max_push < push_items)
 		push_items = max_push;
@@ -1924,7 +1865,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(0),
 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
-		           push_items * sizeof(struct btrfs_key_ptr));
+			   push_items * sizeof(struct btrfs_key_ptr));
 
 	btrfs_set_header_nritems(src, src_nritems - push_items);
 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
@@ -1945,7 +1886,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
  *
  * returns zero on success or < 0 on failure.
  */
-static int noinline insert_new_root(struct btrfs_trans_handle *trans,
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
@@ -2176,14 +2117,15 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int noinline btrfs_leaf_free_space(struct btrfs_root *root,
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 				   struct extent_buffer *leaf)
 {
 	int nritems = btrfs_header_nritems(leaf);
 	int ret;
 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
 	if (ret < 0) {
-		printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
+		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
+		       "used %d nritems %d\n",
 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
 		       leaf_space_used(leaf, 0, nritems), nritems);
 	}
@@ -2219,9 +2161,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 
 	slot = path->slots[1];
-	if (!path->nodes[1]) {
+	if (!path->nodes[1])
 		return 1;
-	}
+
 	upper = path->nodes[1];
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
@@ -2418,9 +2360,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 
 	right_nritems = btrfs_header_nritems(right);
-	if (right_nritems == 0) {
+	if (right_nritems == 0)
 		return 1;
-	}
 
 	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
 
@@ -2502,7 +2443,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			   push_items * sizeof(struct btrfs_item));
 
 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
-		     btrfs_item_offset_nr(right, push_items -1);
+		     btrfs_item_offset_nr(right, push_items - 1);
 
 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
 		     leaf_data_end(root, left) - push_space,
@@ -2537,7 +2478,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* fixup right node */
 	if (push_items > right_nritems) {
-		printk("push items %d nr %u\n", push_items, right_nritems);
+		printk(KERN_CRIT "push items %d nr %u\n", push_items,
+		       right_nritems);
 		WARN_ON(1);
 	}
 
@@ -2640,9 +2582,8 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	/* first try to make some room by pushing left and right */
 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
-		if (wret < 0) {
+		if (wret < 0)
 			return wret;
-		}
 		if (wret) {
 			wret = push_leaf_left(trans, root, path, data_size, 0);
 			if (wret < 0)
@@ -2665,7 +2606,7 @@ again:
 	l = path->nodes[0];
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(l);
-	mid = (nritems + 1)/ 2;
+	mid = (nritems + 1) / 2;
 
 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
 					path->nodes[1]->start,
@@ -2734,7 +2675,7 @@ again:
 				path->slots[0] = 0;
 				if (path->slots[1] == 0) {
 					wret = fixup_low_keys(trans, root,
-					           path, &disk_key, 1);
+						      path, &disk_key, 1);
 					if (wret)
 						ret = wret;
 				}
@@ -3033,8 +2974,8 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			    BTRFS_FILE_EXTENT_INLINE) {
 				ptr = btrfs_item_ptr_offset(leaf, slot);
 				memmove_extent_buffer(leaf, ptr,
-				        (unsigned long)fi,
-				        offsetof(struct btrfs_file_extent_item,
+				      (unsigned long)fi,
+				      offsetof(struct btrfs_file_extent_item,
 						 disk_bytenr));
 			}
 		}
@@ -3096,7 +3037,8 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	BUG_ON(slot < 0);
 	if (slot >= nritems) {
 		btrfs_print_leaf(root, leaf);
-		printk("slot %d too large, nritems %d\n", slot, nritems);
+		printk(KERN_CRIT "slot %d too large, nritems %d\n",
+		       slot, nritems);
 		BUG_ON(1);
 	}
 
@@ -3218,7 +3160,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 
 		if (old_data < data_end) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d old_data %d data_end %d\n",
+			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
 			       slot, old_data, data_end);
 			BUG_ON(1);
 		}
@@ -3317,9 +3259,8 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < nr; i++)
 		total_data += data_size[i];
-	}
 
 	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
@@ -3336,7 +3277,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
 		btrfs_print_leaf(root, leaf);
-		printk("not enough freespace need %u have %d\n",
+		printk(KERN_CRIT "not enough freespace need %u have %d\n",
 		       total_size, btrfs_leaf_free_space(root, leaf));
 		BUG();
 	}
@@ -3349,7 +3290,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 		if (old_data < data_end) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d old_data %d data_end %d\n",
+			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
 			       slot, old_data, data_end);
 			BUG_ON(1);
 		}
@@ -3457,7 +3398,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int wret;
 
 	nritems = btrfs_header_nritems(parent);
-	if (slot != nritems -1) {
+	if (slot != nritems - 1) {
 		memmove_extent_buffer(parent,
 			      btrfs_node_key_ptr_offset(slot),
 			      btrfs_node_key_ptr_offset(slot + 1),
@@ -3614,7 +3555,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			if (btrfs_header_nritems(leaf) == 0) {
 				path->slots[1] = slot;
-				ret = btrfs_del_leaf(trans, root, path, leaf->start);
+				ret = btrfs_del_leaf(trans, root, path,
+						     leaf->start);
 				BUG_ON(ret);
 				free_extent_buffer(leaf);
 			} else {
@@ -3717,7 +3659,7 @@ again:
 		ret = 1;
 		goto out;
 	}
-	while(1) {
+	while (1) {
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
 		sret = bin_search(cur, min_key, level, &slot);
@@ -3738,7 +3680,7 @@ again:
 		 * min_trans parameters.  If it isn't in cache or is too
 		 * old, skip to the next one.
 		 */
-		while(slot < nritems) {
+		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
 			struct extent_buffer *tmp;
@@ -3830,7 +3772,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 	struct extent_buffer *c;
 
 	WARN_ON(!path->keep_locks);
-	while(level < BTRFS_MAX_LEVEL) {
+	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 
@@ -3839,9 +3781,8 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 next:
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL) {
+			if (level == BTRFS_MAX_LEVEL)
 				return 1;
-			}
 			continue;
 		}
 		if (level == 0)
@@ -3889,9 +3830,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int ret;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
-	if (nritems == 0) {
+	if (nritems == 0)
 		return 1;
-	}
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
 
@@ -3915,7 +3855,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		goto done;
 	}
 
-	while(level < BTRFS_MAX_LEVEL) {
+	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 
@@ -3923,9 +3863,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL) {
+			if (level == BTRFS_MAX_LEVEL)
 				return 1;
-			}
 			continue;
 		}
 
@@ -3946,7 +3885,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		break;
 	}
 	path->slots[level] = slot;
-	while(1) {
+	while (1) {
 		level--;
 		c = path->nodes[level];
 		if (path->locks[level])
@@ -3986,7 +3925,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 	u32 nritems;
 	int ret;
 
-	while(1) {
+	while (1) {
 		if (path->slots[0] == 0) {
 			ret = btrfs_prev_leaf(root, path);
 			if (ret != 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ccea0648e10..eee060f8811 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -126,7 +126,6 @@ struct btrfs_ordered_sum;
 static int btrfs_csum_sizes[] = { 4, 0 };
 
 /* four bytes for CRC32 */
-//#define BTRFS_CRC32_SIZE 4
 #define BTRFS_EMPTY_DIR_SIZE 0
 
 #define BTRFS_FT_UNKNOWN	0
@@ -283,8 +282,8 @@ struct btrfs_header {
 } __attribute__ ((__packed__));
 
 #define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
-			        sizeof(struct btrfs_header)) / \
-			        sizeof(struct btrfs_key_ptr))
+				      sizeof(struct btrfs_header)) / \
+				     sizeof(struct btrfs_key_ptr))
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
 #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
@@ -1512,7 +1511,7 @@ static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
 
 static inline int btrfs_is_leaf(struct extent_buffer *eb)
 {
-	return (btrfs_header_level(eb) == 0);
+	return btrfs_header_level(eb) == 0;
 }
 
 /* struct btrfs_root_item */
@@ -1597,8 +1596,8 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 /* struct btrfs_file_extent_item */
 BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
 
-static inline unsigned long btrfs_file_extent_inline_start(struct
-						   btrfs_file_extent_item *e)
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
 {
 	unsigned long offset = (unsigned long)e;
 	offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
@@ -1660,20 +1659,20 @@ static inline int btrfs_set_root_name(struct btrfs_root *root,
 				      const char *name, int len)
 {
 	/* if we already have a name just free it */
-	if (root->name)
-		kfree(root->name);
+	kfree(root->name);
 
 	root->name = kmalloc(len+1, GFP_KERNEL);
 	if (!root->name)
 		return -ENOMEM;
 
 	memcpy(root->name, name, len);
-	root->name[len] ='\0';
+	root->name[len] = '\0';
 
 	return 0;
 }
 
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
+static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
+{
 	if (level == 0)
 		return root->leafsize;
 	return root->nodesize;
@@ -1707,9 +1706,9 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
-struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
-							 btrfs_fs_info *info,
-							 u64 bytenr);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr);
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -1908,8 +1907,9 @@ int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest_root);
 /* dir-item.c */
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len, u64 dir,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, const char *name,
+			  int name_len, u64 dir,
 			  struct btrfs_key *location, u8 type, u64 index);
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 5040b71f190..926a0b287a7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -333,7 +333,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 	leaf = path->nodes[0];
 	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
 	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
-	while(cur < total_len) {
+	while (cur < total_len) {
 		this_len = sizeof(*dir_item) +
 			btrfs_dir_name_len(leaf, dir_item) +
 			btrfs_dir_data_len(leaf, dir_item);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dae25e78a6b..81a313874ae 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,7 +23,7 @@
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h> // for block_sync_page
+#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -40,19 +40,6 @@
 #include "ref-cache.h"
 #include "tree-log.h"
 
-#if 0
-static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
-{
-	if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) {
-		printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n",
-		       (unsigned long long)extent_buffer_blocknr(buf),
-		       (unsigned long long)btrfs_header_blocknr(buf));
-		return 1;
-	}
-	return 0;
-}
-#endif
-
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
@@ -128,23 +115,13 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 		u64 failed_start = em->start;
 		u64 failed_len = em->len;
 
-		printk("failed to insert %Lu %Lu -> %Lu into tree\n",
-		       em->start, em->len, em->block_start);
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
 		if (em) {
-			printk("after failing, found %Lu %Lu %Lu\n",
-			       em->start, em->len, em->block_start);
 			ret = 0;
 		} else {
 			em = lookup_extent_mapping(em_tree, failed_start,
 						   failed_len);
-			if (em) {
-				printk("double failure lookup gives us "
-				       "%Lu %Lu -> %Lu\n", em->start,
-				       em->len, em->block_start);
-				free_extent_map(em);
-			}
 			ret = -EIO;
 		}
 	} else if (ret) {
@@ -191,15 +168,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	unsigned long inline_result;
 
 	len = buf->len - offset;
-	while(len > 0) {
+	while (len > 0) {
 		err = map_private_extent_buffer(buf, offset, 32,
 					&map_token, &kaddr,
 					&map_start, &map_len, KM_USER0);
-		if (err) {
-			printk("failed to map extent buffer! %lu\n",
-			       offset);
+		if (err)
 			return 1;
-		}
 		cur_len = min(len, map_len - (offset - map_start));
 		crc = btrfs_csum_data(root, kaddr + offset - map_start,
 				      crc, cur_len);
@@ -218,15 +192,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	btrfs_csum_final(crc, result);
 
 	if (verify) {
-		/* FIXME, this is not good */
 		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 			u32 val;
 			u32 found = 0;
 			memcpy(&found, result, csum_size);
 
 			read_extent_buffer(buf, &val, 0, csum_size);
-			printk("btrfs: %s checksum verify failed on %llu "
-			       "wanted %X found %X level %d\n",
+			printk(KERN_INFO "btrfs: %s checksum verify failed "
+			       "on %llu wanted %X found %X level %d\n",
 			       root->fs_info->sb->s_id,
 			       buf->start, val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
@@ -293,7 +266,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
-printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
@@ -307,9 +280,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
 }
 
 /*
- * checksum a dirty tree block before IO.  This has extra checks to make
- * sure we only fill in the checksum field in the first page of a multi-page block
+ * checksum a dirty tree block before IO.  This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
  */
+
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
@@ -327,28 +301,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	if (!page->private)
 		goto out;
 	len = page->private >> 2;
-	if (len == 0) {
-		WARN_ON(1);
-	}
+	WARN_ON(len == 0);
+
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
-		       start, found_start, len);
 		WARN_ON(1);
 		goto err;
 	}
 	if (eb->first_page != page) {
-		printk("bad first page %lu %lu\n", eb->first_page->index,
-		       page->index);
 		WARN_ON(1);
 		goto err;
 	}
 	if (!PageUptodate(page)) {
-		printk("csum not up to date page %lu\n", page->index);
 		WARN_ON(1);
 		goto err;
 	}
@@ -396,29 +364,30 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto out;
 	if (!page->private)
 		goto out;
+
 	len = page->private >> 2;
-	if (len == 0) {
-		WARN_ON(1);
-	}
+	WARN_ON(len == 0);
+
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("bad tree block start %llu %llu\n",
+		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
 		       (unsigned long long)found_start,
 		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	if (eb->first_page != page) {
-		printk("bad first page %lu %lu\n", eb->first_page->index,
-		       page->index);
+		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
+		       eb->first_page->index, page->index);
 		WARN_ON(1);
 		ret = -EIO;
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
-		printk("bad fsid on block %Lu\n", eb->start);
+		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
@@ -578,7 +547,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			   HZ/10);
 	}
 #endif
-	while(atomic_read(&fs_info->async_submit_draining) &&
+	while (atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
 			   (atomic_read(&fs_info->nr_async_submits) == 0));
@@ -594,7 +563,7 @@ static int btree_csum_one_bio(struct bio *bio)
 	struct btrfs_root *root;
 
 	WARN_ON(bio->bi_vcnt <= 0);
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 		csum_dirty_buffer(root, bvec->bv_page);
 		bio_index++;
@@ -680,9 +649,8 @@ static int btree_writepages(struct address_space *mapping,
 
 		num_dirty = count_range_bits(tree, &start, (u64)-1,
 					     thresh, EXTENT_DIRTY);
-		if (num_dirty < thresh) {
+		if (num_dirty < thresh)
 			return 0;
-		}
 	}
 	return extent_writepages(tree, mapping, btree_get_extent, wbc);
 }
@@ -701,15 +669,14 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	int ret;
 
 	if (PageWriteback(page) || PageDirty(page))
-	    return 0;
+		return 0;
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 
 	ret = try_release_extent_state(map, tree, page, gfp_flags);
-	if (!ret) {
+	if (!ret)
 		return 0;
-	}
 
 	ret = try_release_extent_buffer(tree, page);
 	if (ret == 1) {
@@ -728,8 +695,8 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
 	if (PagePrivate(page)) {
-		printk("warning page private not zero on page %Lu\n",
-		       page_offset(page));
+		printk(KERN_WARNING "btrfs warning page private not zero "
+		       "on page %llu\n", (unsigned long long)page_offset(page));
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -813,7 +780,7 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
 	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
-				  buf->start, buf->start + buf->len -1);
+				  buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -832,11 +799,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
-	if (ret == 0) {
+	if (ret == 0)
 		buf->flags |= EXTENT_UPTODATE;
-	} else {
+	else
 		WARN_ON(1);
-	}
 	return buf;
 
 }
@@ -944,7 +910,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	if (!log_root_tree)
 		return 0;
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
 				    0, &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -1165,24 +1131,6 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	root->in_sysfs = 1;
 	return root;
 }
-#if 0
-static int add_hasher(struct btrfs_fs_info *info, char *type) {
-	struct btrfs_hasher *hasher;
-
-	hasher = kmalloc(sizeof(*hasher), GFP_NOFS);
-	if (!hasher)
-		return -ENOMEM;
-	hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC);
-	if (!hasher->hash_tfm) {
-		kfree(hasher);
-		return -EINVAL;
-	}
-	spin_lock(&info->hash_lock);
-	list_add(&hasher->list, &info->hashers);
-	spin_unlock(&info->hash_lock);
-	return 0;
-}
-#endif
 
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
@@ -1226,9 +1174,8 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 			continue;
 
 		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn) {
+		if (bdi->unplug_io_fn)
 			bdi->unplug_io_fn(bdi, page);
-		}
 	}
 }
 
@@ -1420,8 +1367,9 @@ static int transaction_kthread(void *arg)
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-			printk("btrfs: total reference cache size %Lu\n",
-				root->fs_info->total_ref_cache_size);
+			printk(KERN_INFO "btrfs: total reference cache "
+			       "size %llu\n", (unsigned long long)
+			       root->fs_info->total_ref_cache_size);
 		}
 
 		mutex_lock(&root->fs_info->trans_mutex);
@@ -1592,14 +1540,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->tree_log_writers, 0);
 	fs_info->tree_log_transid = 0;
 
-#if 0
-	ret = add_hasher(fs_info, "crc32c");
-	if (ret) {
-		printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
-		err = -ENOMEM;
-		goto fail_iput;
-	}
-#endif
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -1720,7 +1660,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
-		printk("btrfs: valid FS not found on %s\n", sb->s_id);
+		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
 
@@ -1728,8 +1668,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk("btrfs: failed to read the system array on %s\n",
-		       sb->s_id);
+		printk(KERN_WARNING "btrfs: failed to read the system "
+		       "array on %s\n", sb->s_id);
 		goto fail_sys_array;
 	}
 
@@ -1746,14 +1686,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	BUG_ON(!chunk_root->node);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
-	         (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
-		 BTRFS_UUID_SIZE);
+	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+	   BTRFS_UUID_SIZE);
 
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk("btrfs: failed to read chunk tree on %s\n", sb->s_id);
+		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+		       sb->s_id);
 		goto fail_chunk_root;
 	}
 
@@ -1812,7 +1753,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
-			printk("Btrfs log replay required on RO media\n");
+			printk(KERN_WARNING "Btrfs log replay required "
+			       "on RO media\n");
 			err = -EIO;
 			goto fail_trans_kthread;
 		}
@@ -2097,7 +2039,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk("btrfs: %d errors while writing supers\n", total_errors);
+		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		       total_errors);
 		BUG();
 	}
 
@@ -2114,7 +2057,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk("btrfs: %d errors while writing supers\n", total_errors);
+		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		       total_errors);
 		BUG();
 	}
 	return 0;
@@ -2137,16 +2081,11 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 		down_write(&root->anon_super.s_umount);
 		kill_anon_super(&root->anon_super);
 	}
-#if 0
-	if (root->in_sysfs)
-		btrfs_sysfs_del_root(root);
-#endif
 	if (root->node)
 		free_extent_buffer(root->node);
 	if (root->commit_root)
 		free_extent_buffer(root->commit_root);
-	if (root->name)
-		kfree(root->name);
+	kfree(root->name);
 	kfree(root);
 	return 0;
 }
@@ -2157,7 +2096,7 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 	struct btrfs_root *gang[8];
 	int i;
 
-	while(1) {
+	while (1) {
 		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
 					     (void **)gang, 0,
 					     ARRAY_SIZE(gang));
@@ -2228,18 +2167,17 @@ int close_ctree(struct btrfs_root *root)
 
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret =  btrfs_commit_super(root);
-		if (ret) {
-			printk("btrfs: commit super returns %d\n", ret);
-		}
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
 	if (fs_info->delalloc_bytes) {
-		printk("btrfs: at unmount delalloc count %Lu\n",
-		       fs_info->delalloc_bytes);
+		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
+		       (unsigned long long)fs_info->delalloc_bytes);
 	}
 	if (fs_info->total_ref_cache_size) {
-		printk("btrfs: at umount reference cache size %Lu\n",
-			fs_info->total_ref_cache_size);
+		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
+		       (unsigned long long)fs_info->total_ref_cache_size);
 	}
 
 	if (fs_info->extent_root->node)
@@ -2248,13 +2186,13 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->tree_root->node)
 		free_extent_buffer(fs_info->tree_root->node);
 
-	if (root->fs_info->chunk_root->node);
+	if (root->fs_info->chunk_root->node)
 		free_extent_buffer(root->fs_info->chunk_root->node);
 
-	if (root->fs_info->dev_root->node);
+	if (root->fs_info->dev_root->node)
 		free_extent_buffer(root->fs_info->dev_root->node);
 
-	if (root->fs_info->csum_root->node);
+	if (root->fs_info->csum_root->node)
 		free_extent_buffer(root->fs_info->csum_root->node);
 
 	btrfs_free_block_groups(root->fs_info);
@@ -2273,7 +2211,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 #if 0
-	while(!list_empty(&fs_info->hashers)) {
+	while (!list_empty(&fs_info->hashers)) {
 		struct btrfs_hasher *hasher;
 		hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
 				    hashers);
@@ -2324,9 +2262,11 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
-		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
+		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+		       "found %llu running %llu\n",
 			(unsigned long long)buf->start,
-			transid, root->fs_info->generation);
+			(unsigned long long)transid,
+			(unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
@@ -2361,9 +2301,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	int ret;
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-	if (ret == 0) {
+	if (ret == 0)
 		buf->flags |= EXTENT_UPTODATE;
-	}
 	return ret;
 }
 
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 48b82cd7583..85315d2c90d 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -7,9 +7,11 @@
 #include "export.h"
 #include "compat.h"
 
-#define BTRFS_FID_SIZE_NON_CONNECTABLE		(offsetof(struct btrfs_fid, parent_objectid)/4)
-#define BTRFS_FID_SIZE_CONNECTABLE		(offsetof(struct btrfs_fid, parent_root_objectid)/4)
-#define BTRFS_FID_SIZE_CONNECTABLE_ROOT		(sizeof(struct btrfs_fid)/4)
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+						 parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+					     parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
 
 static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 			   int connectable)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 780c1eeb829..ec43fa526d7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,10 +49,10 @@ struct pending_extent_op {
 	int del;
 };
 
-static int finish_current_insert(struct btrfs_trans_handle *trans, struct
-				 btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root, int all);
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *extent_root, int all);
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root, int all);
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 bytenr, u64 num_bytes, int is_data);
@@ -247,7 +247,7 @@ static int cache_block_group(struct btrfs_root *root,
 	if (ret < 0)
 		goto err;
 
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
@@ -292,9 +292,8 @@ err:
 /*
  * return the block group that starts at or after bytenr
  */
-static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
-						       btrfs_fs_info *info,
-							 u64 bytenr)
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 {
 	struct btrfs_block_group_cache *cache;
 
@@ -306,9 +305,9 @@ static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 /*
  * return the block group that contains teh given bytenr
  */
-struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
-							 btrfs_fs_info *info,
-							 u64 bytenr)
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr)
 {
 	struct btrfs_block_group_cache *cache;
 
@@ -492,7 +491,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  * to the key objectid.
  */
 
-static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
@@ -537,7 +536,7 @@ out:
  * updates all the backrefs that are pending on update_list for the
  * extent_root
  */
-static int noinline update_backrefs(struct btrfs_trans_handle *trans,
+static noinline int update_backrefs(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_path *path,
 				    struct list_head *update_list)
@@ -573,9 +572,11 @@ loop:
 	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
 	    (ref_objectid != op->level &&
 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		printk(KERN_ERR "couldn't find %Lu, parent %Lu, root %Lu, "
-		       "owner %u\n", op->bytenr, op->orig_parent,
-		       ref_root, op->level);
+		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
+		       "root %llu, owner %u\n",
+		       (unsigned long long)op->bytenr,
+		       (unsigned long long)op->orig_parent,
+		       (unsigned long long)ref_root, op->level);
 		btrfs_print_leaf(extent_root, leaf);
 		BUG();
 	}
@@ -620,7 +621,7 @@ out:
 	return 0;
 }
 
-static int noinline insert_extents(struct btrfs_trans_handle *trans,
+static noinline int insert_extents(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *extent_root,
 				   struct btrfs_path *path,
 				   struct list_head *insert_list, int nr)
@@ -781,7 +782,7 @@ static int noinline insert_extents(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
@@ -840,7 +841,7 @@ out:
 	return ret;
 }
 
-static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path)
 {
@@ -868,7 +869,7 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
 #else
 	blkdev_issue_discard(bdev, start >> 9, len >> 9);
@@ -908,7 +909,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
 
-static int noinline free_extents(struct btrfs_trans_handle *trans,
+static noinline int free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
 				 struct list_head *del_list)
 {
@@ -937,10 +938,11 @@ search:
 				    extent_root->root_key.objectid,
 				    op->orig_generation, op->level, 1);
 	if (ret) {
-		printk("Unable to find backref byte nr %Lu root %Lu gen %Lu "
-		       "owner %u\n", op->bytenr,
-		       extent_root->root_key.objectid, op->orig_generation,
-		       op->level);
+		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
+		       "root %llu gen %llu owner %u\n",
+		       (unsigned long long)op->bytenr,
+		       (unsigned long long)extent_root->root_key.objectid,
+		       (unsigned long long)op->orig_generation, op->level);
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		goto out;
@@ -1282,7 +1284,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 	if (key.objectid != bytenr) {
 		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
-		printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
+		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
+		       (unsigned long long)bytenr,
+		       (unsigned long long)key.objectid);
 		BUG();
 	}
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
@@ -1353,7 +1357,8 @@ int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
-		printk("failed to find block number %Lu\n", bytenr);
+		printk(KERN_INFO "btrfs failed to find block number %llu\n",
+		       (unsigned long long)bytenr);
 		BUG();
 	}
 	l = path->nodes[0];
@@ -1738,7 +1743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	while(1) {
+	while (1) {
 		cache = NULL;
 		spin_lock(&root->fs_info->block_group_cache_lock);
 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
@@ -1921,10 +1926,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	spin_unlock(&space_info->lock);
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
-	if (ret) {
-printk("space info full %Lu\n", flags);
+	if (ret)
 		space_info->full = 1;
-	}
 out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
@@ -1941,7 +1944,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
-	while(total) {
+	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache)
 			return -1;
@@ -2089,7 +2092,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	int ret;
 
 	mutex_lock(&root->fs_info->pinned_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -2110,7 +2113,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 
 	mutex_lock(&root->fs_info->pinned_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
@@ -2400,7 +2403,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	if (ret == 0) {
 		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
-		while(extent_slot > 0) {
+		while (extent_slot > 0) {
 			extent_slot--;
 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 					      extent_slot);
@@ -2422,8 +2425,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 						&key, path, -1, 1);
 			if (ret) {
 				printk(KERN_ERR "umm, got %d back from search"
-				       ", was looking for %Lu\n", ret,
-				       bytenr);
+				       ", was looking for %llu\n", ret,
+				       (unsigned long long)bytenr);
 				btrfs_print_leaf(extent_root, path->nodes[0]);
 			}
 			BUG_ON(ret);
@@ -2432,9 +2435,12 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	} else {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
-		printk("Unable to find ref byte nr %Lu root %Lu "
-		       "gen %Lu owner %Lu\n", bytenr,
-		       root_objectid, ref_generation, owner_objectid);
+		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
+		       "root %llu gen %llu owner %llu\n",
+		       (unsigned long long)bytenr,
+		       (unsigned long long)root_objectid,
+		       (unsigned long long)ref_generation,
+		       (unsigned long long)owner_objectid);
 	}
 
 	leaf = path->nodes[0];
@@ -2517,8 +2523,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
  * find all the blocks marked as pending in the radix tree and remove
  * them from the extent map
  */
-static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root, int all)
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root, int all)
 {
 	int ret;
 	int err = 0;
@@ -2539,7 +2545,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 
 again:
 	mutex_lock(&info->extent_ins_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(pending_del, search, &start, &end,
 					    EXTENT_WRITEBACK);
 		if (ret) {
@@ -2753,7 +2759,7 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-static int noinline find_free_extent(struct btrfs_trans_handle *trans,
+static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
 				     u64 search_start, u64 search_end,
@@ -2762,7 +2768,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				     int data)
 {
 	int ret = 0;
-	struct btrfs_root * root = orig_root->fs_info->extent_root;
+	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
 	u64 last_wanted = 0;
@@ -2995,8 +3001,10 @@ loop_check:
 			*last_ptr = ins->objectid + ins->offset;
 		ret = 0;
 	} else if (!ret) {
-		printk(KERN_ERR "we were searching for %Lu bytes, num_bytes %Lu,"
-		       " loop %d, allowed_alloc %d\n", total_needed, num_bytes,
+		printk(KERN_ERR "btrfs searching for %llu bytes, "
+		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
+		       (unsigned long long)total_needed,
+		       (unsigned long long)num_bytes,
 		       loop, allowed_chunk_alloc);
 		ret = -ENOSPC;
 	}
@@ -3012,19 +3020,22 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	struct btrfs_block_group_cache *cache;
 	struct list_head *l;
 
-	printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
-	       info->total_bytes - info->bytes_used - info->bytes_pinned -
-	       info->bytes_reserved, (info->full) ? "" : "not ");
+	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+	       (unsigned long long)(info->total_bytes - info->bytes_used -
+				    info->bytes_pinned - info->bytes_reserved),
+	       (info->full) ? "" : "not ");
 
 	down_read(&info->groups_sem);
 	list_for_each(l, &info->block_groups) {
 		cache = list_entry(l, struct btrfs_block_group_cache, list);
 		spin_lock(&cache->lock);
-		printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
-		       "%Lu pinned %Lu reserved\n",
-		       cache->key.objectid, cache->key.offset,
-		       btrfs_block_group_used(&cache->item),
-		       cache->pinned, cache->reserved);
+		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
+		       "%llu pinned %llu reserved\n",
+		       (unsigned long long)cache->key.objectid,
+		       (unsigned long long)cache->key.offset,
+		       (unsigned long long)btrfs_block_group_used(&cache->item),
+		       (unsigned long long)cache->pinned,
+		       (unsigned long long)cache->reserved);
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
@@ -3045,15 +3056,15 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
-			        info->data_alloc_profile;
+			info->data_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
 	} else if (root == root->fs_info->chunk_root) {
 		alloc_profile = info->avail_system_alloc_bits &
-			        info->system_alloc_profile;
+			info->system_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
 	} else {
 		alloc_profile = info->avail_metadata_alloc_bits &
-			        info->metadata_alloc_profile;
+			info->metadata_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
@@ -3092,8 +3103,9 @@ again:
 		struct btrfs_space_info *sinfo;
 
 		sinfo = __find_space_info(root->fs_info, data);
-		printk("allocation failed flags %Lu, wanted %Lu\n",
-		       data, num_bytes);
+		printk(KERN_ERR "btrfs allocation failed flags %llu, "
+		       "wanted %llu\n", (unsigned long long)data,
+		       (unsigned long long)num_bytes);
 		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
@@ -3108,7 +3120,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
+		printk(KERN_ERR "Unable to find block group for %llu\n",
+		       (unsigned long long)start);
 		return -ENOSPC;
 	}
 
@@ -3235,10 +3248,12 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	}
 
 update_block:
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
+	ret = update_block_group(trans, root, ins->objectid,
+				 ins->offset, 1, 0);
 	if (ret) {
-		printk("update block group failed for %Lu %Lu\n",
-		       ins->objectid, ins->offset);
+		printk(KERN_ERR "btrfs update block group failed for %llu "
+		       "%llu\n", (unsigned long long)ins->objectid,
+		       (unsigned long long)ins->offset);
 		BUG();
 	}
 out:
@@ -3420,7 +3435,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_leaf_ref *ref)
 {
@@ -3445,15 +3460,15 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
-			      u32 *refs)
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+				     u64 len, u32 *refs)
 {
 	int ret;
 
 	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
 	BUG_ON(ret);
 
-#if 0 // some debugging code in case we see problems here
+#if 0 /* some debugging code in case we see problems here */
 	/* if the refs count is one, it won't get increased again.  But
 	 * if the ref count is > 1, someone may be decreasing it at
 	 * the same time we are.
@@ -3474,8 +3489,8 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len
 			free_extent_buffer(eb);
 		}
 		if (*refs == 1) {
-			printk("block %llu went down to one during drop_snap\n",
-			       (unsigned long long)start);
+			printk(KERN_ERR "btrfs block %llu went down to one "
+			       "during drop_snap\n", (unsigned long long)start);
 		}
 
 	}
@@ -3489,7 +3504,7 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
-static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path, int *level)
 {
@@ -3516,7 +3531,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 	/*
 	 * walk down to the last node level and free all the leaves
 	 */
-	while(*level >= 0) {
+	while (*level >= 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
@@ -3576,10 +3591,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				*level = 0;
 				break;
 			}
-			if (printk_ratelimit()) {
-				printk("leaf ref miss for bytenr %llu\n",
-				       (unsigned long long)bytenr);
-			}
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
@@ -3641,7 +3652,7 @@ out:
  * walk_down_tree. The main difference is that it checks reference
  * counts while tree blocks are locked.
  */
-static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path, int *level)
 {
@@ -3730,7 +3741,7 @@ out:
  * to find the first node higher up where we haven't yet gone through
  * all the slots
  */
-static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
 				 int *level, int max_level)
@@ -3839,7 +3850,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			}
 		}
 	}
-	while(1) {
+	while (1) {
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
@@ -3920,7 +3931,7 @@ static unsigned long calc_ra(unsigned long start, unsigned long last,
 	return min(last, start + nr - 1);
 }
 
-static int noinline relocate_inode_pages(struct inode *inode, u64 start,
+static noinline int relocate_inode_pages(struct inode *inode, u64 start,
 					 u64 len)
 {
 	u64 page_start;
@@ -4011,7 +4022,7 @@ out_unlock:
 	return ret;
 }
 
-static int noinline relocate_data_extent(struct inode *reloc_inode,
+static noinline int relocate_data_extent(struct inode *reloc_inode,
 					 struct btrfs_key *extent_key,
 					 u64 offset)
 {
@@ -4087,7 +4098,7 @@ static int is_cowonly_root(u64 root_objectid)
 	return 0;
 }
 
-static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
+static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_ref_path *ref_path,
 				    int first_time)
@@ -4119,11 +4130,10 @@ walk_down:
 		if (level < ref_path->lowest_level)
 			break;
 
-		if (level >= 0) {
+		if (level >= 0)
 			bytenr = ref_path->nodes[level];
-		} else {
+		else
 			bytenr = ref_path->extent_start;
-		}
 		BUG_ON(bytenr == 0);
 
 		parent = ref_path->nodes[level + 1];
@@ -4170,11 +4180,12 @@ walk_up:
 	level = ref_path->current_level;
 	while (level < BTRFS_MAX_LEVEL - 1) {
 		u64 ref_objectid;
-		if (level >= 0) {
+
+		if (level >= 0)
 			bytenr = ref_path->nodes[level];
-		} else {
+		else
 			bytenr = ref_path->extent_start;
-		}
+
 		BUG_ON(bytenr == 0);
 
 		key.objectid = bytenr;
@@ -4299,7 +4310,7 @@ static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
 	return __next_ref_path(trans, extent_root, ref_path, 0);
 }
 
-static int noinline get_new_locations(struct inode *reloc_inode,
+static noinline int get_new_locations(struct inode *reloc_inode,
 				      struct btrfs_key *extent_key,
 				      u64 offset, int no_fragment,
 				      struct disk_extent **extents,
@@ -4420,7 +4431,7 @@ out:
 	return ret;
 }
 
-static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
+static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key,
@@ -4778,7 +4789,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline invalidate_extent_cache(struct btrfs_root *root,
+static noinline int invalidate_extent_cache(struct btrfs_root *root,
 					struct extent_buffer *leaf,
 					struct btrfs_block_group_cache *group,
 					struct btrfs_root *target_root)
@@ -4826,7 +4837,7 @@ static int noinline invalidate_extent_cache(struct btrfs_root *root,
 	return 0;
 }
 
-static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct extent_buffer *leaf,
 					struct btrfs_block_group_cache *group,
@@ -5035,7 +5046,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
 	return 0;
 }
 
-static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
+static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root)
 {
 	struct btrfs_root *reloc_root;
@@ -5102,7 +5113,7 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
  * tree blocks are shared between reloc trees, so they are also shared
  * between subvols.
  */
-static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
+static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      struct btrfs_key *first_key,
@@ -5199,7 +5210,7 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
+static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_path *path,
 					struct btrfs_key *first_key,
@@ -5217,7 +5228,7 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
+static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_path *path,
 				    struct btrfs_key *extent_key)
@@ -5233,7 +5244,7 @@ out:
 	return ret;
 }
 
-static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
+static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
 						struct btrfs_ref_path *ref_path)
 {
 	struct btrfs_key root_key;
@@ -5248,7 +5259,7 @@ static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
 }
 
-static int noinline relocate_one_extent(struct btrfs_root *extent_root,
+static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key,
 					struct btrfs_block_group_cache *group,
@@ -5276,8 +5287,8 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 
 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
 	if (!ref_path) {
-	       ret = -ENOMEM;
-	       goto out;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	for (loops = 0; ; loops++) {
@@ -5497,7 +5508,7 @@ out:
 	return ret;
 }
 
-static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
+static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 					struct btrfs_block_group_cache *group)
 {
 	struct inode *inode = NULL;
@@ -5617,7 +5628,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	block_group = btrfs_lookup_block_group(info, group_start);
 	BUG_ON(!block_group);
 
-	printk("btrfs relocating block group %llu flags %llu\n",
+	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
 	       (unsigned long long)block_group->key.objectid,
 	       (unsigned long long)block_group->flags);
 
@@ -5649,7 +5660,7 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
@@ -5712,7 +5723,7 @@ next:
 	}
 
 	if (total_found > 0) {
-		printk("btrfs found %llu extents in pass %d\n",
+		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
 		       (unsigned long long)total_found, pass);
 		pass++;
 		if (total_found == skipped && pass > 2) {
@@ -5754,7 +5765,7 @@ static int find_first_block_group(struct btrfs_root *root,
 	if (ret < 0)
 		goto out;
 
-	while(1) {
+	while (1) {
 		slot = path->slots[0];
 		leaf = path->nodes[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
@@ -5825,7 +5836,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
-	while(1) {
+	while (1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
 			ret = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0bf7684207a..39edb551dca 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -32,7 +32,7 @@ static LIST_HEAD(states);
 
 #define LEAK_DEBUG 0
 #ifdef LEAK_DEBUG
-static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(leak_lock);
 #endif
 
 #define BUFFER_LRU_MAX 64
@@ -81,7 +81,11 @@ void extent_io_exit(void)
 
 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
-		printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
+		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+		       "state %lu in tree %p refs %d\n",
+		       (unsigned long long)state->start,
+		       (unsigned long long)state->end,
+		       state->state, state->tree, atomic_read(&state->refs));
 		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
 
@@ -89,7 +93,9 @@ void extent_io_exit(void)
 
 	while (!list_empty(&buffers)) {
 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
-		printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
+		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+		       "refs %d\n", (unsigned long long)eb->start,
+		       eb->len, atomic_read(&eb->refs));
 		list_del(&eb->leak_list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
@@ -158,11 +164,11 @@ EXPORT_SYMBOL(free_extent_state);
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct tree_entry *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct tree_entry, rb_node);
 
@@ -185,13 +191,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 				     struct rb_node **next_ret)
 {
 	struct rb_root *root = &tree->state;
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
 	struct tree_entry *entry;
 	struct tree_entry *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct tree_entry, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -200,14 +206,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 			n = n->rb_left;
 		else if (offset > entry->end)
 			n = n->rb_right;
-		else {
+		else
 			return n;
-		}
 	}
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset > prev_entry->end) {
+		while (prev && offset > prev_entry->end) {
 			prev = rb_next(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
@@ -217,7 +222,7 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 
 	if (next_ret) {
 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
-		while(prev && offset < prev_entry->start) {
+		while (prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
@@ -233,9 +238,8 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 	struct rb_node *ret;
 
 	ret = __etree_search(tree, offset, &prev, NULL);
-	if (!ret) {
+	if (!ret)
 		return prev;
-	}
 	return ret;
 }
 
@@ -243,11 +247,11 @@ static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
 					  u64 offset, struct rb_node *node)
 {
 	struct rb_root *root = &tree->buffer;
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct extent_buffer *eb;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		eb = rb_entry(parent, struct extent_buffer, rb_node);
 
@@ -268,10 +272,10 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
 					   u64 offset)
 {
 	struct rb_root *root = &tree->buffer;
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct extent_buffer *eb;
 
-	while(n) {
+	while (n) {
 		eb = rb_entry(n, struct extent_buffer, rb_node);
 		if (offset < eb->start)
 			n = n->rb_left;
@@ -363,7 +367,9 @@ static int insert_state(struct extent_io_tree *tree,
 	struct rb_node *node;
 
 	if (end < start) {
-		printk("end < start %Lu %Lu\n", end, start);
+		printk(KERN_ERR "btrfs end < start %llu %llu\n",
+		       (unsigned long long)end,
+		       (unsigned long long)start);
 		WARN_ON(1);
 	}
 	if (bits & EXTENT_DIRTY)
@@ -376,7 +382,10 @@ static int insert_state(struct extent_io_tree *tree,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
+		       "%llu %llu\n", (unsigned long long)found->start,
+		       (unsigned long long)found->end,
+		       (unsigned long long)start, (unsigned long long)end);
 		free_extent_state(state);
 		return -EEXIST;
 	}
@@ -412,7 +421,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
@@ -661,8 +669,9 @@ static void set_state_bits(struct extent_io_tree *tree,
  * [start, end] is inclusive
  * This takes the tree lock.
  */
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
-		   int exclusive, u64 *failed_start, gfp_t mask)
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			  int bits, int exclusive, u64 *failed_start,
+			  gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -763,7 +772,7 @@ again:
 		if (end < last_start)
 			this_end = end;
 		else
-			this_end = last_start -1;
+			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
 				   bits);
 		prealloc = NULL;
@@ -891,8 +900,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_uptodate);
 
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			  gfp_t mask)
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+				 u64 end, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
 }
@@ -904,8 +913,8 @@ static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			      0, NULL, mask);
 }
 
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
-			   gfp_t mask)
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
+				  u64 end, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
 }
@@ -1025,11 +1034,10 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->end >= start && (state->state & bits)) {
 			*start_ret = state->start;
@@ -1062,15 +1070,14 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
+		if (state->end >= start && (state->state & bits))
 			return state;
-		}
+
 		node = rb_next(node);
 		if (!node)
 			break;
@@ -1108,7 +1115,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 		goto out;
 	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (found && (state->start != cur_start ||
 			      (state->state & EXTENT_BOUNDARY))) {
@@ -1150,7 +1157,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
 	if (index == locked_page->index && end_index == index)
 		return 0;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long, nr_pages,
 				     ARRAY_SIZE(pages)), pages);
@@ -1186,7 +1193,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 
 	/* skip the page at the start index */
 	nrpages = end_index - index + 1;
-	while(nrpages > 0) {
+	while (nrpages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nrpages, ARRAY_SIZE(pages)), pages);
@@ -1263,17 +1270,16 @@ again:
 	 * pages in order, so we can't process delalloc bytes before
 	 * locked_page
 	 */
-	if (delalloc_start < *start) {
+	if (delalloc_start < *start)
 		delalloc_start = *start;
-	}
 
 	/*
 	 * make sure to limit the number of pages we try to lock down
 	 * if we're looping.
 	 */
-	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
-	}
+
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
 				  delalloc_start, delalloc_end);
@@ -1341,7 +1347,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
 		return 0;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
@@ -1384,7 +1390,6 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	int found = 0;
 
 	if (search_end <= cur_start) {
-		printk("search_end %Lu start %Lu\n", search_end, cur_start);
 		WARN_ON(1);
 		return 0;
 	}
@@ -1399,11 +1404,10 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, cur_start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->start > search_end)
 			break;
@@ -1927,19 +1931,15 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		nr = bio_get_nr_vecs(bdev);
 
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
-	if (!bio) {
-		printk("failed to allocate bio nr %d\n", nr);
-	}
 
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 
-	if (bio_ret) {
+	if (bio_ret)
 		*bio_ret = bio;
-	} else {
+	else
 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
-	}
 
 	return ret;
 }
@@ -2028,13 +2028,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			break;
 		}
 		extent_offset = cur - em->start;
-		if (extent_map_end(em) <= cur) {
-printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
-		}
 		BUG_ON(extent_map_end(em) <= cur);
-		if (end < cur) {
-printk("2bad mapping end %Lu cur %Lu\n", end, cur);
-		}
 		BUG_ON(end < cur);
 
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
@@ -2199,7 +2193,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
-		while(delalloc_end < page_end) {
+		while (delalloc_end < page_end) {
 			nr_delalloc = find_lock_delalloc_range(inode, tree,
 						       page,
 						       &delalloc_start,
@@ -2242,9 +2236,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	nr_written++;
 
 	end = page_end;
-	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-		printk("found delalloc bits after lock_extent\n");
-	}
+	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
+		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
 
 	if (last_byte <= start) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
@@ -2297,7 +2290,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
 
-			unlock_extent(tree, unlock_start, cur + iosize -1,
+			unlock_extent(tree, unlock_start, cur + iosize - 1,
 				      GFP_NOFS);
 
 			/*
@@ -2344,9 +2337,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
-				printk("warning page %lu not writeback, "
-				       "cur %llu end %llu\n", page->index,
-				       (unsigned long long)cur,
+				printk(KERN_ERR "btrfs warning page %lu not "
+				       "writeback, cur %llu end %llu\n",
+				       page->index, (unsigned long long)cur,
 				       (unsigned long long)end);
 			}
 
@@ -2430,8 +2423,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 retry:
 	while (!done && (index <= end) &&
 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+			      PAGECACHE_TAG_DIRTY, min(end - index,
+				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
@@ -2536,9 +2529,8 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd, flush_write_bio);
-	if (epd.bio) {
+	if (epd.bio)
 		submit_one_bio(WRITE, epd.bio, 0, 0);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_full_page);
@@ -2568,7 +2560,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.range_end	= end + 1,
 	};
 
-	while(start <= end) {
+	while (start <= end) {
 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
 		if (clear_page_dirty_for_io(page))
 			ret = __extent_writepage(page, &wbc_writepages, &epd);
@@ -2606,9 +2598,8 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd,
 				       flush_write_bio);
-	if (epd.bio) {
+	if (epd.bio)
 		submit_one_bio(WRITE, epd.bio, 0, 0);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_writepages);
@@ -2666,7 +2657,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
-	start += (offset + blocksize -1) & ~(blocksize - 1);
+	start += (offset + blocksize - 1) & ~(blocksize - 1);
 	if (start > end)
 		return 0;
 
@@ -2727,12 +2718,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
 	orig_block_start = block_start;
 
 	lock_extent(tree, page_start, page_end, GFP_NOFS);
-	while(block_start <= block_end) {
+	while (block_start <= block_end) {
 		em = get_extent(inode, page, page_offset, block_start,
 				block_end - block_start + 1, 1);
-		if (IS_ERR(em) || !em) {
+		if (IS_ERR(em) || !em)
 			goto err;
-		}
+
 		cur_end = min(block_end, extent_map_end(em) - 1);
 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
 		block_off_end = block_off_start + blocksize;
@@ -3170,7 +3161,7 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
 		set_extent_dirty(tree, page_offset(page),
-				 page_offset(page) + PAGE_CACHE_SIZE -1,
+				 page_offset(page) + PAGE_CACHE_SIZE - 1,
 				 GFP_NOFS);
 		unlock_page(page);
 	}
@@ -3235,7 +3226,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
 	if (ret)
 		return 1;
-	while(start <= end) {
+	while (start <= end) {
 		index = start >> PAGE_CACHE_SHIFT;
 		page = find_get_page(tree->mapping, index);
 		uptodate = PageUptodate(page);
@@ -3321,16 +3312,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			lock_page(page);
 		}
 		locked_pages++;
-		if (!PageUptodate(page)) {
+		if (!PageUptodate(page))
 			all_uptodate = 0;
-		}
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
 			eb->flags |= EXTENT_UPTODATE;
-		if (ret) {
-			printk("all up to date but ret is %d\n", ret);
-		}
 		goto unlock_exit;
 	}
 
@@ -3345,10 +3332,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
 						      mirror_num, &bio_flags);
-			if (err) {
+			if (err)
 				ret = err;
-				printk("err %d from __extent_read_full_page\n", ret);
-			}
 		} else {
 			unlock_page(page);
 		}
@@ -3357,26 +3342,23 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (bio)
 		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-	if (ret || !wait) {
-		if (ret)
-			printk("ret %d wait %d returning\n", ret, wait);
+	if (ret || !wait)
 		return ret;
-	}
+
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
-		if (!PageUptodate(page)) {
-			printk("page not uptodate after wait_on_page_locked\n");
+		if (!PageUptodate(page))
 			ret = -EIO;
-		}
 	}
+
 	if (!ret)
 		eb->flags |= EXTENT_UPTODATE;
 	return ret;
 
 unlock_exit:
 	i = start_i;
-	while(locked_pages > 0) {
+	while (locked_pages > 0) {
 		page = extent_buffer_page(eb, i);
 		i++;
 		unlock_page(page);
@@ -3403,7 +3385,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -3442,8 +3424,11 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		offset = 0;
 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
 	}
+
 	if (start + min_len > eb->len) {
-printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
+		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+		       "wanted %lu %lu\n", (unsigned long long)eb->start,
+		       eb->len, start, min_len);
 		WARN_ON(1);
 	}
 
@@ -3506,7 +3491,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -3542,7 +3527,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3574,7 +3559,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3607,7 +3592,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	offset = (start_offset + dst_offset) &
 		((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(dst, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3674,17 +3659,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+		       "len %lu dst len %lu\n", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 
-	while(len > 0) {
+	while (len > 0) {
 		dst_off_in_page = (start_offset + dst_offset) &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
 		src_off_in_page = (start_offset + src_offset) &
@@ -3722,20 +3707,20 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+		       "len %lu len %lu\n", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+		       "len %lu len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset < src_offset) {
 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
 		return;
 	}
-	while(len > 0) {
+	while (len > 0) {
 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fd3ebfb8c3c..4a83e33ada3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -89,11 +89,11 @@ EXPORT_SYMBOL(free_extent_map);
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct extent_map *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct extent_map, rb_node);
 
@@ -122,13 +122,13 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 				     struct rb_node **prev_ret,
 				     struct rb_node **next_ret)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
 	struct extent_map *entry;
 	struct extent_map *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct extent_map, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -145,7 +145,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset >= extent_map_end(prev_entry)) {
+		while (prev && offset >= extent_map_end(prev_entry)) {
 			prev = rb_next(prev);
 			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
@@ -155,7 +155,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (next_ret) {
 		prev_entry = rb_entry(prev, struct extent_map, rb_node);
-		while(prev && offset < prev_entry->start) {
+		while (prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
 			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index cc6e0b6de94..b11abfad81a 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,7 +24,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-#define MAX_CSUM_ITEMS(r,size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
@@ -166,7 +166,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);
 
 	disk_bytenr = (u64)bio->bi_sector << 9;
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
@@ -192,8 +192,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 						offset + bvec->bv_len - 1,
 						EXTENT_NODATASUM, GFP_NOFS);
 				} else {
-					printk("no csum found for inode %lu "
-					       "start %llu\n", inode->i_ino,
+					printk(KERN_INFO "btrfs no csum found "
+					       "for inode %lu start %llu\n",
+					       inode->i_ino,
 					       (unsigned long long)offset);
 				}
 				item = NULL;
@@ -373,7 +374,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	BUG_ON(!ordered);
 	sums->bytenr = ordered->start;
 
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		if (!contig)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 
@@ -507,7 +508,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 
-	while(1) {
+	while (1) {
 		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 		key.offset = end_byte - 1;
 		key.type = BTRFS_EXTENT_CSUM_KEY;
@@ -715,9 +716,8 @@ again:
 			goto csum;
 
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != csum_size) {
+		if (diff != csum_size)
 			goto insert;
-		}
 
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
@@ -732,7 +732,7 @@ insert:
 		u64 next_sector = sector_sum->bytenr;
 		struct btrfs_sector_sum *next = sector_sum + 1;
 
-		while(tmp < sums->len) {
+		while (tmp < sums->len) {
 			if (next_sector + root->sectorsize != next->bytenr)
 				break;
 			tmp += root->sectorsize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5908521922f..0e3a13a4565 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -44,10 +44,10 @@
 /* simple helper to fault in pages and copy.  This should go away
  * and be replaced with calls into generic code.
  */
-static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
-					 const char __user * buf)
+					 const char __user *buf)
 {
 	long page_fault = 0;
 	int i;
@@ -78,7 +78,7 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
+static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -103,7 +103,7 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
  * this also makes the decision about creating an inline extent vs
  * doing real data extents, marking pages dirty and delalloc as required.
  */
-static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
+static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct file *file,
 				   struct page **pages,
@@ -137,9 +137,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	btrfs_set_trans_block_group(trans, inode);
 	hint_byte = 0;
 
-	if ((end_of_last_block & 4095) == 0) {
-		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
-	}
 	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
 	/* check for reserved extents on each page, we don't want
@@ -185,7 +182,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		len = (u64)-1;
 		testend = 0;
 	}
-	while(1) {
+	while (1) {
 		if (!split)
 			split = alloc_extent_map(GFP_NOFS);
 		if (!split2)
@@ -295,7 +292,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 	path = btrfs_alloc_path();
 	ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
 				       last_offset, 0);
-	while(1) {
+	while (1) {
 		nritems = btrfs_header_nritems(path->nodes[0]);
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(root, path);
@@ -314,8 +311,10 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 		if (found_key.offset < last_offset) {
 			WARN_ON(1);
 			btrfs_print_leaf(root, leaf);
-			printk("inode %lu found offset %Lu expected %Lu\n",
-			       inode->i_ino, found_key.offset, last_offset);
+			printk(KERN_ERR "inode %lu found offset %llu "
+			       "expected %llu\n", inode->i_ino,
+			       (unsigned long long)found_key.offset,
+			       (unsigned long long)last_offset);
 			err = 1;
 			goto out;
 		}
@@ -331,7 +330,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 			extent_end = found_key.offset +
 			     btrfs_file_extent_inline_len(leaf, extent);
 			extent_end = (extent_end + root->sectorsize - 1) &
-				~((u64)root->sectorsize -1 );
+				~((u64)root->sectorsize - 1);
 		}
 		last_offset = extent_end;
 		path->slots[0]++;
@@ -339,8 +338,9 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 	if (0 && last_offset < inode->i_size) {
 		WARN_ON(1);
 		btrfs_print_leaf(root, leaf);
-		printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
-		       last_offset, inode->i_size);
+		printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
+		       inode->i_ino, (unsigned long long)last_offset,
+		       (unsigned long long)inode->i_size);
 		err = 1;
 
 	}
@@ -362,7 +362,7 @@ out:
  * inline_limit is used to tell this code which offsets in the file to keep
  * if they contain inline extents.
  */
-int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
+noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
@@ -398,7 +398,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	while(1) {
+	while (1) {
 		recow = 0;
 		btrfs_release_path(root, path);
 		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
@@ -649,16 +649,15 @@ next_slot:
 			if (disk_bytenr != 0) {
 				ret = btrfs_update_extent_ref(trans, root,
 						disk_bytenr, orig_parent,
-					        leaf->start,
+						leaf->start,
 						root->root_key.objectid,
 						trans->transid, ins.objectid);
 
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
-			if (disk_bytenr != 0) {
+			if (disk_bytenr != 0)
 				inode_add_bytes(inode, extent_end - end);
-			}
 		}
 
 		if (found_extent && !keep) {
@@ -944,7 +943,7 @@ done:
  * waits for data=ordered extents to finish before allowing the pages to be
  * modified.
  */
-static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
+static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
@@ -979,7 +978,8 @@ again:
 		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(inode)->io_tree,
 			    start_pos, last_pos - 1, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
@@ -1085,7 +1085,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 	}
 
-	while(count > 0) {
+	while (count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(count, nrptrs *
 					(size_t)PAGE_CACHE_SIZE -
@@ -1178,7 +1178,7 @@ out_nolock:
 	return num_written ? num_written : err;
 }
 
-int btrfs_release_file(struct inode * inode, struct file * filp)
+int btrfs_release_file(struct inode *inode, struct file *filp)
 {
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
@@ -1237,9 +1237,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 
 	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-	if (ret < 0) {
+	if (ret < 0)
 		goto out;
-	}
 
 	/* we've logged all the items and now have a consistent
 	 * version of the file in the log.  It is possible that
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2e69b9c3043..d1e5f0e84c5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,10 +213,13 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 		info->offset = offset;
 		info->bytes += bytes;
 	} else if (right_info && right_info->offset != offset+bytes) {
-		printk(KERN_ERR "adding space in the middle of an existing "
-		       "free space area. existing: offset=%Lu, bytes=%Lu. "
-		       "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
-		       right_info->bytes, offset, bytes);
+		printk(KERN_ERR "btrfs adding space in the middle of an "
+		       "existing free space area. existing: "
+		       "offset=%llu, bytes=%llu. new: offset=%llu, "
+		       "bytes=%llu\n", (unsigned long long)right_info->offset,
+		       (unsigned long long)right_info->bytes,
+		       (unsigned long long)offset,
+		       (unsigned long long)bytes);
 		BUG();
 	}
 
@@ -225,11 +228,14 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 
 		if (unlikely((left_info->offset + left_info->bytes) !=
 			     offset)) {
-			printk(KERN_ERR "free space to the left of new free "
-			       "space isn't quite right. existing: offset=%Lu,"
-			       " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
-			       left_info->offset, left_info->bytes, offset,
-			       bytes);
+			printk(KERN_ERR "btrfs free space to the left "
+			       "of new free space isn't "
+			       "quite right. existing: offset=%llu, "
+			       "bytes=%llu. new: offset=%llu, bytes=%llu\n",
+			       (unsigned long long)left_info->offset,
+			       (unsigned long long)left_info->bytes,
+			       (unsigned long long)offset,
+			       (unsigned long long)bytes);
 			BUG();
 		}
 
@@ -265,8 +271,7 @@ out:
 			BUG();
 	}
 
-	if (alloc_info)
-		kfree(alloc_info);
+	kfree(alloc_info);
 
 	return ret;
 }
@@ -283,9 +288,11 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (info && info->offset == offset) {
 		if (info->bytes < bytes) {
-			printk(KERN_ERR "Found free space at %Lu, size %Lu,"
-			       "trying to use %Lu\n",
-			       info->offset, info->bytes, bytes);
+			printk(KERN_ERR "Found free space at %llu, size %llu,"
+			       "trying to use %llu\n",
+			       (unsigned long long)info->offset,
+			       (unsigned long long)info->bytes,
+			       (unsigned long long)bytes);
 			WARN_ON(1);
 			ret = -EINVAL;
 			goto out;
@@ -401,8 +408,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes)
 			count++;
-		//printk(KERN_INFO "offset=%Lu, bytes=%Lu\n", info->offset,
-		//       info->bytes);
 	}
 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
 	       "\n", count);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 80038c5ef7c..2aa79873eb4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -129,7 +129,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 		last_ino = key.objectid + 1;
 		path->slots[0]++;
 	}
-	// FIXME -ENOSPC
 	BUG_ON(1);
 found:
 	btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 068bad46338..1b35ea63b6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -124,7 +124,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
  * the btree.  The caller should have done a btrfs_drop_extents so that
  * no overlapping inline items exist in the btree
  */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode,
 				u64 start, size_t size, size_t compressed_size,
 				struct page **compressed_pages)
@@ -148,7 +148,8 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 		cur_size = compressed_size;
 	}
 
-	path = btrfs_alloc_path(); if (!path)
+	path = btrfs_alloc_path();
+	if (!path)
 		return -ENOMEM;
 
 	btrfs_set_trans_block_group(trans, inode);
@@ -165,7 +166,6 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	if (ret) {
 		err = ret;
-		printk("got bad ret %d\n", ret);
 		goto fail;
 	}
 	leaf = path->nodes[0];
@@ -181,7 +181,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 	if (use_compress) {
 		struct page *cpage;
 		int i = 0;
-		while(compressed_size > 0) {
+		while (compressed_size > 0) {
 			cpage = compressed_pages[i];
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
@@ -519,8 +519,7 @@ free_pages_out:
 		WARN_ON(pages[i]->mapping);
 		page_cache_release(pages[i]);
 	}
-	if (pages)
-		kfree(pages);
+	kfree(pages);
 
 	goto out;
 }
@@ -549,7 +548,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 
 	trans = btrfs_join_transaction(root, 1);
 
-	while(!list_empty(&async_cow->extents)) {
+	while (!list_empty(&async_cow->extents)) {
 		async_extent = list_entry(async_cow->extents.next,
 					  struct async_extent, list);
 		list_del(&async_extent->list);
@@ -562,8 +561,8 @@ static noinline int submit_compressed_extents(struct inode *inode,
 			unsigned long nr_written = 0;
 
 			lock_extent(io_tree, async_extent->start,
-				    async_extent->start + async_extent->ram_size - 1,
-				    GFP_NOFS);
+				    async_extent->start +
+				    async_extent->ram_size - 1, GFP_NOFS);
 
 			/* allocate blocks */
 			cow_file_range(inode, async_cow->locked_page,
@@ -581,7 +580,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 			if (!page_started)
 				extent_write_locked_range(io_tree,
 						  inode, async_extent->start,
-					          async_extent->start +
+						  async_extent->start +
 						  async_extent->ram_size - 1,
 						  btrfs_get_extent,
 						  WB_SYNC_ALL);
@@ -618,7 +617,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
-		while(1) {
+		while (1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
 			spin_unlock(&em_tree->lock);
@@ -651,11 +650,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
 					     NULL, 1, 1, 0, 1, 1, 0);
 
 		ret = btrfs_submit_compressed_write(inode,
-				         async_extent->start,
-					 async_extent->ram_size,
-					 ins.objectid,
-					 ins.offset, async_extent->pages,
-					 async_extent->nr_pages);
+				    async_extent->start,
+				    async_extent->ram_size,
+				    ins.objectid,
+				    ins.offset, async_extent->pages,
+				    async_extent->nr_pages);
 
 		BUG_ON(ret);
 		trans = btrfs_join_transaction(root, 1);
@@ -735,14 +734,13 @@ static noinline int cow_file_range(struct inode *inode,
 
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
-	while(disk_num_bytes > 0) {
+	while (disk_num_bytes > 0) {
 		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
-		if (ret) {
-			BUG();
-		}
+		BUG_ON(ret);
+
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
 		em->orig_start = em->start;
@@ -755,7 +753,7 @@ static noinline int cow_file_range(struct inode *inode,
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
-		while(1) {
+		while (1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
 			spin_unlock(&em_tree->lock);
@@ -779,11 +777,9 @@ static noinline int cow_file_range(struct inode *inode,
 			BUG_ON(ret);
 		}
 
-		if (disk_num_bytes < cur_alloc_size) {
-			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
-			       cur_alloc_size);
+		if (disk_num_bytes < cur_alloc_size)
 			break;
-		}
+
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
@@ -842,9 +838,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	    waitqueue_active(&root->fs_info->async_submit_wait))
 		wake_up(&root->fs_info->async_submit_wait);
 
-	if (async_cow->inode) {
+	if (async_cow->inode)
 		submit_compressed_extents(async_cow->inode, async_cow);
-	}
 }
 
 static noinline void async_cow_free(struct btrfs_work *work)
@@ -871,7 +866,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
 			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
-	while(start < end) {
+	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		async_cow->inode = inode;
 		async_cow->root = root;
@@ -904,7 +899,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 			    limit));
 		}
 
-		while(atomic_read(&root->fs_info->async_submit_draining) &&
+		while (atomic_read(&root->fs_info->async_submit_draining) &&
 		      atomic_read(&root->fs_info->async_delalloc_pages)) {
 			wait_event(root->fs_info->async_submit_wait,
 			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
@@ -918,7 +913,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	return 0;
 }
 
-static int noinline csum_exist_in_range(struct btrfs_root *root,
+static noinline int csum_exist_in_range(struct btrfs_root *root,
 					u64 bytenr, u64 num_bytes)
 {
 	int ret;
@@ -1146,13 +1141,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 
 	if (btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-                                        page_started, 1, nr_written);
+					 page_started, 1, nr_written);
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-                                        page_started, 0, nr_written);
+					 page_started, 0, nr_written);
 	else
 		ret = cow_file_range_async(inode, locked_page, start, end,
-				     page_started, nr_written);
+					   page_started, nr_written);
 
 	return ret;
 }
@@ -1200,8 +1195,11 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 
 		spin_lock(&root->fs_info->delalloc_lock);
 		if (end - start + 1 > root->fs_info->delalloc_bytes) {
-			printk("warning: delalloc account %Lu %Lu\n",
-			       end - start + 1, root->fs_info->delalloc_bytes);
+			printk(KERN_INFO "btrfs warning: delalloc account "
+			       "%llu %llu\n",
+			       (unsigned long long)end - start + 1,
+			       (unsigned long long)
+			       root->fs_info->delalloc_bytes);
 			root->fs_info->delalloc_bytes = 0;
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
@@ -1241,9 +1239,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	ret = btrfs_map_block(map_tree, READ, logical,
 			      &map_length, NULL, 0);
 
-	if (map_length < length + size) {
+	if (map_length < length + size)
 		return 1;
-	}
 	return 0;
 }
 
@@ -1255,8 +1252,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1341,9 +1339,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
 {
-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
+	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
-	}
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   GFP_NOFS);
 }
@@ -1755,14 +1752,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
-	if (ret) {
+	if (ret)
 		goto zeroit;
-	}
+
 	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
 	btrfs_csum_final(csum, (char *)&csum);
-	if (csum != private) {
+	if (csum != private)
 		goto zeroit;
-	}
+
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
 good:
@@ -1773,9 +1770,10 @@ good:
 	return 0;
 
 zeroit:
-	printk("btrfs csum failed ino %lu off %llu csum %u private %Lu\n",
-	       page->mapping->host->i_ino, (unsigned long long)start, csum,
-	       private);
+	printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+	       "private %llu\n", page->mapping->host->i_ino,
+	       (unsigned long long)start, csum,
+	       (unsigned long long)private);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
@@ -2097,9 +2095,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 /*
  * copy everything in the in-memory inode into the btree.
  */
-int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *inode)
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_path *path;
@@ -2174,7 +2171,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 				  inode->i_ino,
 				  dir->i_ino, &index);
 	if (ret) {
-		printk("failed to delete reference to %.*s, "
+		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
 		       "inode %lu parent %lu\n", name_len, name,
 		       inode->i_ino, dir->i_ino);
 		goto err;
@@ -2280,9 +2277,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
-	if (!err) {
+	if (!err)
 		btrfs_i_size_write(inode, 0);
-	}
 
 fail_trans:
 	nr = trans->blocks_used;
@@ -2516,9 +2512,9 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0) {
+	if (ret < 0)
 		goto error;
-	}
+
 	if (ret > 0) {
 		/* there are no items in the tree for us to truncate, we're
 		 * done
@@ -2530,7 +2526,7 @@ search_again:
 		path->slots[0]--;
 	}
 
-	while(1) {
+	while (1) {
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -2562,19 +2558,18 @@ search_again:
 			item_end--;
 		}
 		if (item_end < new_size) {
-			if (found_type == BTRFS_DIR_ITEM_KEY) {
+			if (found_type == BTRFS_DIR_ITEM_KEY)
 				found_type = BTRFS_INODE_ITEM_KEY;
-			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
+			else if (found_type == BTRFS_EXTENT_ITEM_KEY)
 				found_type = BTRFS_EXTENT_DATA_KEY;
-			} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
+			else if (found_type == BTRFS_EXTENT_DATA_KEY)
 				found_type = BTRFS_XATTR_ITEM_KEY;
-			} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
+			else if (found_type == BTRFS_XATTR_ITEM_KEY)
 				found_type = BTRFS_INODE_REF_KEY;
-			} else if (found_type) {
+			else if (found_type)
 				found_type--;
-			} else {
+			else
 				break;
-			}
 			btrfs_set_key_type(&key, found_type);
 			goto next;
 		}
@@ -2656,7 +2651,7 @@ delete:
 				pending_del_nr++;
 				pending_del_slot = path->slots[0];
 			} else {
-				printk("bad pending slot %d pending_del_nr %d pending_del_slot %d\n", path->slots[0], pending_del_nr, pending_del_slot);
+				BUG();
 			}
 		} else {
 			break;
@@ -2938,9 +2933,10 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 				    namelen, 0);
 	if (IS_ERR(di))
 		ret = PTR_ERR(di);
-	if (!di || IS_ERR(di)) {
+
+	if (!di || IS_ERR(di))
 		goto out_err;
-	}
+
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 out:
 	btrfs_free_path(path);
@@ -3020,8 +3016,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
-	return (args->ino == inode->i_ino &&
-		args->root == BTRFS_I(inode)->root);
+	return args->ino == inode->i_ino &&
+		args->root == BTRFS_I(inode)->root;
 }
 
 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
@@ -3085,7 +3081,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
-	struct inode * inode;
+	struct inode *inode;
 	struct btrfs_inode *bi = BTRFS_I(dir);
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
@@ -3385,9 +3381,8 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
 
 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
 		ret = btrfs_set_inode_index_count(dir);
-		if (ret) {
+		if (ret)
 			return ret;
-		}
 	}
 
 	*index = BTRFS_I(dir)->index_cnt;
@@ -3879,12 +3874,13 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
- * the ugly parts come from merging extents from the disk with the
- * in-ram representation.  This gets more complex because of the data=ordered code,
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation.  This gets more complex because of the data=ordered code,
  * where the in-ram extents might be locked pending data=ordered completion.
  *
  * This also copies inline extents directly into the page.
  */
+
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
@@ -4081,7 +4077,7 @@ again:
 				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
 	} else {
-		printk("unkknown found_type %d\n", found_type);
+		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
 		WARN_ON(1);
 	}
 not_found:
@@ -4093,7 +4089,11 @@ not_found_em:
 insert:
 	btrfs_release_path(root, path);
 	if (em->start > start || extent_map_end(em) <= start) {
-		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
+		printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
+		       "[%llu %llu]\n", (unsigned long long)em->start,
+		       (unsigned long long)em->len,
+		       (unsigned long long)start,
+		       (unsigned long long)len);
 		err = -EIO;
 		goto out;
 	}
@@ -4130,8 +4130,6 @@ insert:
 				}
 			} else {
 				err = -EIO;
-				printk("failing to insert %Lu %Lu\n",
-				       start, len);
 				free_extent_map(em);
 				em = NULL;
 			}
@@ -4147,9 +4145,8 @@ out:
 		btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
-		if (!err) {
+		if (!err)
 			err = ret;
-		}
 	}
 	if (err) {
 		free_extent_map(em);
@@ -4482,13 +4479,15 @@ void btrfs_destroy_inode(struct inode *inode)
 	}
 	spin_unlock(&BTRFS_I(inode)->root->list_lock);
 
-	while(1) {
+	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
 			break;
 		else {
-			printk("found ordered extent %Lu %Lu\n",
-			       ordered->file_offset, ordered->len);
+			printk(KERN_ERR "btrfs found ordered "
+			       "extent %llu %llu on inode cleanup\n",
+			       (unsigned long long)ordered->file_offset,
+			       (unsigned long long)ordered->len);
 			btrfs_remove_ordered_extent(inode, ordered);
 			btrfs_put_ordered_extent(ordered);
 			btrfs_put_ordered_extent(ordered);
@@ -4572,8 +4571,8 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	return 0;
 }
 
-static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -4663,7 +4662,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 		return -EROFS;
 
 	spin_lock(&root->fs_info->delalloc_lock);
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
@@ -4684,7 +4683,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	 * ordered extents get created before we return
 	 */
 	atomic_inc(&root->fs_info->async_submit_draining);
-	while(atomic_read(&root->fs_info->nr_async_submits) ||
+	while (atomic_read(&root->fs_info->nr_async_submits) ||
 	      atomic_read(&root->fs_info->async_delalloc_pages)) {
 		wait_event(root->fs_info->async_submit_wait,
 		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba484aac1b9..c2aa33e3feb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -311,7 +311,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 		 * to see if is references the subvolume where we are
 		 * placing this new snapshot.
 		 */
-		while(1) {
+		while (1) {
 			if (!test ||
 			    dir == snap_src->fs_info->sb->s_root ||
 			    test == snap_src->fs_info->sb->s_root ||
@@ -319,7 +319,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 				break;
 			}
 			if (S_ISLNK(test->d_inode->i_mode)) {
-				printk("Symlink in snapshot path, failed\n");
+				printk(KERN_INFO "Btrfs symlink in snapshot "
+				       "path, failed\n");
 				error = -EMLINK;
 				btrfs_free_path(path);
 				goto out_drop_write;
@@ -329,7 +330,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 			ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
 				  path, test_oid, parent_oid);
 			if (ret == 0) {
-				printk("Snapshot creation failed, looping\n");
+				printk(KERN_INFO "Btrfs snapshot creation "
+				       "failed, looping\n");
 				error = -EMLINK;
 				btrfs_free_path(path);
 				goto out_drop_write;
@@ -617,7 +619,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 
 		src_inode = src_file->f_path.dentry->d_inode;
 		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
-			printk("btrfs: Snapshot src from another FS\n");
+			printk(KERN_INFO "btrfs: Snapshot src from "
+			       "another FS\n");
 			ret = -EINVAL;
 			fput(src_file);
 			goto out;
@@ -810,9 +813,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	    ((off + len) & (bs-1)))
 		goto out_unlock;
 
-	printk("final src extent is %llu~%llu\n", off, len);
-	printk("final dst extent is %llu~%llu\n", destoff, len);
-
 	/* do any pending delalloc/csum calc on src, one way or
 	   another, and lock file content */
 	while (1) {
@@ -883,10 +883,13 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			comp = btrfs_file_extent_compression(leaf, extent);
 			type = btrfs_file_extent_type(leaf, extent);
 			if (type == BTRFS_FILE_EXTENT_REG) {
-				disko = btrfs_file_extent_disk_bytenr(leaf, extent);
-				diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+				disko = btrfs_file_extent_disk_bytenr(leaf,
+								      extent);
+				diskl = btrfs_file_extent_disk_num_bytes(leaf,
+								 extent);
 				datao = btrfs_file_extent_offset(leaf, extent);
-				datal = btrfs_file_extent_num_bytes(leaf, extent);
+				datal = btrfs_file_extent_num_bytes(leaf,
+								    extent);
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 				/* take upper bound, may be compressed */
 				datal = btrfs_file_extent_ram_bytes(leaf,
@@ -916,8 +919,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 				extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
-				printk("  orig disk %llu~%llu data %llu~%llu\n",
-				       disko, diskl, datao, datal);
 
 				if (off > key.offset) {
 					datao += off - key.offset;
@@ -929,8 +930,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				/* disko == 0 means it's a hole */
 				if (!disko)
 					datao = 0;
-				printk(" final disk %llu~%llu data %llu~%llu\n",
-				       disko, diskl, datao, datal);
 
 				btrfs_set_file_extent_offset(leaf, extent,
 							     datao);
@@ -952,12 +951,11 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					skip = off - key.offset;
 					new_key.offset += skip;
 				}
+
 				if (key.offset + datal > off+len)
 					trim = key.offset + datal - (off+len);
-				printk("len %lld skip %lld trim %lld\n",
-				       datal, skip, trim);
+
 				if (comp && (skip || trim)) {
-					printk("btrfs clone_range can't split compressed inline extents yet\n");
 					ret = -EINVAL;
 					goto out;
 				}
@@ -969,7 +967,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					goto out;
 
 				if (skip) {
-					u32 start = btrfs_file_extent_calc_inline_size(0);
+					u32 start =
+					  btrfs_file_extent_calc_inline_size(0);
 					memmove(buf+start, buf+start+skip,
 						datal);
 				}
@@ -985,7 +984,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_mark_buffer_dirty(leaf);
 		}
 
-	next:
+next:
 		btrfs_release_path(root, path);
 		key.offset++;
 	}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index e30aa6e2958..39bae7761db 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -31,9 +31,10 @@
  * difference in almost every workload, but spinning for the right amount of
  * time needs some help.
  *
- * In general, we want to spin as long as the lock holder is doing btree searches,
- * and we should give up if they are in more expensive code.
+ * In general, we want to spin as long as the lock holder is doing btree
+ * searches, and we should give up if they are in more expensive code.
  */
+
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
 	int i;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d9e232227da..a2094017027 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -39,11 +39,11 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)
 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct btrfs_ordered_extent *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
 
@@ -67,13 +67,13 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 				     struct rb_node **prev_ret)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *test;
 	struct btrfs_ordered_extent *entry;
 	struct btrfs_ordered_extent *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -88,7 +88,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	if (!prev_ret)
 		return NULL;
 
-	while(prev && file_offset >= entry_end(prev_entry)) {
+	while (prev && file_offset >= entry_end(prev_entry)) {
 		test = rb_next(prev);
 		if (!test)
 			break;
@@ -102,7 +102,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	if (prev)
 		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
 				      rb_node);
-	while(prev && file_offset < entry_end(prev_entry)) {
+	while (prev && file_offset < entry_end(prev_entry)) {
 		test = rb_prev(prev);
 		if (!test)
 			break;
@@ -193,10 +193,8 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
-	if (node) {
-		printk("warning dup entry from add_ordered_extent\n");
-		BUG();
-	}
+	BUG_ON(node);
+
 	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
 			   entry_end(entry) - 1, GFP_NOFS);
 
@@ -282,7 +280,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 	struct btrfs_ordered_sum *sum;
 
 	if (atomic_dec_and_test(&entry->refs)) {
-		while(!list_empty(&entry->list)) {
+		while (!list_empty(&entry->list)) {
 			cur = entry->list.next;
 			sum = list_entry(cur, struct btrfs_ordered_sum, list);
 			list_del(&sum->list);
@@ -432,11 +430,10 @@ again:
 					   orig_end >> PAGE_CACHE_SHIFT);
 
 	end = orig_end;
-	while(1) {
+	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
-		if (!ordered) {
+		if (!ordered)
 			break;
-		}
 		if (ordered->file_offset > orig_end) {
 			btrfs_put_ordered_extent(ordered);
 			break;
@@ -492,7 +489,7 @@ out:
  * if none is found
  */
 struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -553,7 +550,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * yet
 	 */
 	node = &ordered->rb_node;
-	while(1) {
+	while (1) {
 		node = rb_prev(node);
 		if (!node)
 			break;
@@ -581,9 +578,8 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 		 * between our ordered extent and the next one.
 		 */
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		if (test->file_offset > entry_end(ordered)) {
+		if (test->file_offset > entry_end(ordered))
 			i_size_test = test->file_offset;
-		}
 	} else {
 		i_size_test = i_size_read(inode);
 	}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 64725c13aa1..5f8f218c100 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -24,13 +24,14 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 {
 	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
 	int i;
-	printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
+	printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+	       "num_stripes %d\n",
 	       (unsigned long long)btrfs_chunk_length(eb, chunk),
 	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
 	       (unsigned long long)btrfs_chunk_type(eb, chunk),
 	       num_stripes);
 	for (i = 0 ; i < num_stripes ; i++) {
-		printk("\t\t\tstripe %d devid %llu offset %llu\n", i,
+		printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
 		      (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
 		      (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
 	}
@@ -38,8 +39,8 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 static void print_dev_item(struct extent_buffer *eb,
 			   struct btrfs_dev_item *dev_item)
 {
-	printk("\t\tdev item devid %llu "
-	       "total_bytes %llu bytes used %Lu\n",
+	printk(KERN_INFO "\t\tdev item devid %llu "
+	       "total_bytes %llu bytes used %llu\n",
 	       (unsigned long long)btrfs_device_id(eb, dev_item),
 	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
 	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
@@ -61,14 +62,15 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_dev_extent *dev_extent;
 	u32 type;
 
-	printk("leaf %llu total ptrs %d free space %d\n",
+	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(l, i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = btrfs_key_type(&key);
-		printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
+		printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+		       "itemsize %d\n",
 			i,
 			(unsigned long long)key.objectid, type,
 			(unsigned long long)key.offset,
@@ -76,33 +78,36 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
-			printk("\t\tinode generation %llu size %llu mode %o\n",
-		              (unsigned long long)btrfs_inode_generation(l, ii),
+			printk(KERN_INFO "\t\tinode generation %llu size %llu "
+			       "mode %o\n",
+			       (unsigned long long)
+			       btrfs_inode_generation(l, ii),
 			      (unsigned long long)btrfs_inode_size(l, ii),
 			       btrfs_inode_mode(l, ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			btrfs_dir_item_key_to_cpu(l, di, &found_key);
-			printk("\t\tdir oid %llu type %u\n",
+			printk(KERN_INFO "\t\tdir oid %llu type %u\n",
 				(unsigned long long)found_key.objectid,
 				btrfs_dir_type(l, di));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-			printk("\t\troot data bytenr %llu refs %u\n",
-				(unsigned long long)btrfs_disk_root_bytenr(l, ri),
+			printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+				(unsigned long long)
+				btrfs_disk_root_bytenr(l, ri),
 				btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
 			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-			printk("\t\textent data refs %u\n",
+			printk(KERN_INFO "\t\textent data refs %u\n",
 				btrfs_extent_refs(l, ei));
 			break;
 		case BTRFS_EXTENT_REF_KEY:
 			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
-			printk("\t\textent back ref root %llu gen %llu "
-			       "owner %llu num_refs %lu\n",
+			printk(KERN_INFO "\t\textent back ref root %llu "
+			       "gen %llu owner %llu num_refs %lu\n",
 			       (unsigned long long)btrfs_ref_root(l, ref),
 			       (unsigned long long)btrfs_ref_generation(l, ref),
 			       (unsigned long long)btrfs_ref_objectid(l, ref),
@@ -114,26 +119,36 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 					    struct btrfs_file_extent_item);
 			if (btrfs_file_extent_type(l, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE) {
-				printk("\t\tinline extent data size %u\n",
-			           btrfs_file_extent_inline_len(l, fi));
+				printk(KERN_INFO "\t\tinline extent data "
+				       "size %u\n",
+				       btrfs_file_extent_inline_len(l, fi));
 				break;
 			}
-			printk("\t\textent data disk bytenr %llu nr %llu\n",
-			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
-			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-			printk("\t\textent data offset %llu nr %llu ram %llu\n",
-			  (unsigned long long)btrfs_file_extent_offset(l, fi),
-			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
-			  (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
+			printk(KERN_INFO "\t\textent data disk bytenr %llu "
+			       "nr %llu\n",
+			       (unsigned long long)
+			       btrfs_file_extent_disk_bytenr(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_disk_num_bytes(l, fi));
+			printk(KERN_INFO "\t\textent data offset %llu "
+			       "nr %llu ram %llu\n",
+			       (unsigned long long)
+			       btrfs_file_extent_offset(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_num_bytes(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
-			printk("\t\tblock group used %llu\n",
-			       (unsigned long long)btrfs_disk_block_group_used(l, bi));
+			printk(KERN_INFO "\t\tblock group used %llu\n",
+			       (unsigned long long)
+			       btrfs_disk_block_group_used(l, bi));
 			break;
 		case BTRFS_CHUNK_ITEM_KEY:
-			print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
+			print_chunk(l, btrfs_item_ptr(l, i,
+						      struct btrfs_chunk));
 			break;
 		case BTRFS_DEV_ITEM_KEY:
 			print_dev_item(l, btrfs_item_ptr(l, i,
@@ -142,7 +157,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_DEV_EXTENT_KEY:
 			dev_extent = btrfs_item_ptr(l, i,
 						    struct btrfs_dev_extent);
-			printk("\t\tdev extent chunk_tree %llu\n"
+			printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
 			       "\t\tchunk objectid %llu chunk offset %llu "
 			       "length %llu\n",
 			       (unsigned long long)
@@ -171,13 +186,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 		btrfs_print_leaf(root, c);
 		return;
 	}
-	printk("node %llu level %d total ptrs %d free spc %u\n",
+	printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
 	       (unsigned long long)btrfs_header_bytenr(c),
 	       btrfs_header_level(c), nr,
 	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
-		printk("\tkey %d (%llu %u %llu) block %llu\n",
+		printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
 		       i,
 		       (unsigned long long)key.objectid,
 		       key.type,
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index a50ebb67055..6f0acc4c9ea 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -74,11 +74,11 @@ void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct btrfs_leaf_ref *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
 
@@ -98,10 +98,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 
 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct btrfs_leaf_ref *entry;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
 		WARN_ON(!entry->in_tree);
 
@@ -127,7 +127,7 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 		return 0;
 
 	spin_lock(&tree->lock);
-	while(!list_empty(&tree->list)) {
+	while (!list_empty(&tree->list)) {
 		ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
 		BUG_ON(ref->tree != tree);
 		if (ref->root_gen > max_root_gen)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f99335a999d..b48650de447 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -132,8 +132,9 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
-		printk("unable to update root key %Lu %u %Lu\n",
-		       key->objectid, key->type, key->offset);
+		printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
+		       (unsigned long long)key->objectid, key->type,
+		       (unsigned long long)key->offset);
 		BUG_ON(1);
 	}
 
@@ -159,9 +160,9 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 /*
  * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed.  This is any root item with an offset
- * lower than the latest root.  They need to be queued for deletion to finish
- * what was happening when we crashed.
+ * the process of being deleted if we crashed.  This is any root item with an
+ * offset lower than the latest root.  They need to be queued for deletion to
+ * finish what was happening when we crashed.
  */
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest)
@@ -188,7 +189,7 @@ again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
 		slot = path->slots[0];
@@ -258,11 +259,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
 	if (ret < 0)
 		goto out;
-	if (ret) {
-btrfs_print_leaf(root, path->nodes[0]);
-printk("failed to del %Lu %u %Lu\n", key->objectid, key->type, key->offset);
 
-	}
 	BUG_ON(ret != 0);
 	leaf = path->nodes[0];
 	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 8d7f568009c..c0f7ecaf1e7 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -66,7 +66,7 @@ u##bits btrfs_##name(struct extent_buffer *eb,				\
 		unsigned long map_len;					\
 		u##bits res;						\
 		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
+				sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
@@ -103,7 +103,7 @@ void btrfs_set_##name(struct extent_buffer *eb,				\
 		unsigned long map_start;				\
 		unsigned long map_len;					\
 		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
+				sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ccdcb7bb7ad..b4c101d9322 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,18 +55,12 @@
 
 static struct super_operations btrfs_super_ops;
 
-static void btrfs_put_super (struct super_block * sb)
+static void btrfs_put_super(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
 	ret = close_ctree(root);
-	if (ret) {
-		printk("close ctree returns %d\n", ret);
-	}
-#if 0
-	btrfs_sysfs_del_super(root->fs_info);
-#endif
 	sb->s_fs_info = NULL;
 }
 
@@ -299,12 +293,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	return error;
 }
 
-static int btrfs_fill_super(struct super_block * sb,
+static int btrfs_fill_super(struct super_block *sb,
 			    struct btrfs_fs_devices *fs_devices,
-			    void * data, int silent)
+			    void *data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root_dentry;
+	struct inode *inode;
+	struct dentry *root_dentry;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root;
 	struct btrfs_inode *bi;
@@ -479,8 +473,10 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		root = dget(s->s_root);
 	else {
 		mutex_lock(&s->s_root->d_inode->i_mutex);
-		root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
+		root = lookup_one_len(subvol_name, s->s_root,
+				      strlen(subvol_name));
 		mutex_unlock(&s->s_root->d_inode->i_mutex);
+
 		if (IS_ERR(root)) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
@@ -557,8 +553,9 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
+
 	/* We treat it as constant endianness (it doesn't matter _which_)
-	   because we want the fsid to come out the same whether mounted 
+	   because we want the fsid to come out the same whether mounted
 	   on a big-endian or little-endian host */
 	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
 	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
@@ -658,7 +655,8 @@ static int btrfs_interface_init(void)
 static void btrfs_interface_exit(void)
 {
+	/* deregistration failure is logged but otherwise ignored */
 	if (misc_deregister(&btrfs_misc) < 0)
-		printk("misc_deregister failed for control device");
+		printk(KERN_INFO "misc_deregister failed for control device\n");
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 04087c02084..a240b6fa81d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -67,7 +67,8 @@ struct btrfs_root_attr {
 };
 
 #define ROOT_ATTR(name, mode, show, store) \
-static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store)
+static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
+							      show, store)
 
 ROOT_ATTR(blocks_used,	0444,	root_blocks_used_show,	NULL);
 ROOT_ATTR(block_limit,	0644,	root_block_limit_show,	NULL);
@@ -86,7 +87,8 @@ struct btrfs_super_attr {
 };
 
 #define SUPER_ATTR(name, mode, show, store) \
-static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store)
+static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
+								show, store)
 
 SUPER_ATTR(blocks_used,		0444,	super_blocks_used_show,		NULL);
 SUPER_ATTR(total_blocks,	0444,	super_total_blocks_show,	NULL);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4e7b56e9d3a..56ab1f5ea11 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,9 +28,6 @@
 #include "ref-cache.h"
 #include "tree-log.h"
 
-extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_transaction_cachep;
-
 #define BTRFS_ROOT_TRANS_TAG 0
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
@@ -85,10 +82,10 @@ static noinline int join_transaction(struct btrfs_root *root)
 }
 
 /*
- * this does all the record keeping required to make sure that a
- * reference counted root is properly recorded in a given transaction.
- * This is required to make sure the old root from before we joined the transaction
- * is deleted when the transaction commits
+ * this does all the record keeping required to make sure that a reference
+ * counted root is properly recorded in a given transaction.  This is required
+ * to make sure the old root from before we joined the transaction is deleted
+ * when the transaction commits
  */
 noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
@@ -144,7 +141,7 @@ static void wait_current_trans(struct btrfs_root *root)
 	if (cur_trans && cur_trans->blocked) {
 		DEFINE_WAIT(wait);
 		cur_trans->use_count++;
-		while(1) {
+		while (1) {
 			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
 			if (cur_trans->blocked) {
@@ -213,7 +210,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 {
 	DEFINE_WAIT(wait);
 	mutex_lock(&root->fs_info->trans_mutex);
-	while(!commit->commit_done) {
+	while (!commit->commit_done) {
 		prepare_to_wait(&commit->commit_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (commit->commit_done)
@@ -228,8 +225,8 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 }
 
 /*
- * rate limit against the drop_snapshot code.  This helps to slow down new operations
- * if the drop_snapshot code isn't able to keep up.
+ * rate limit against the drop_snapshot code.  This helps to slow down new
+ * operations if the drop_snapshot code isn't able to keep up.
  */
 static void throttle_on_drops(struct btrfs_root *root)
 {
@@ -332,12 +329,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	u64 end;
 	unsigned long index;
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-		while(start <= end) {
+		while (start <= end) {
 			cond_resched();
 
 			index = start >> PAGE_CACHE_SHIFT;
@@ -368,14 +365,14 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
 
 		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
-		while(start <= end) {
+		while (start <= end) {
 			index = start >> PAGE_CACHE_SHIFT;
 			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 			page = find_get_page(btree_inode->i_mapping, index);
@@ -431,7 +428,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	btrfs_write_dirty_block_groups(trans, root);
 	btrfs_extent_post_op(trans, root);
 
-	while(1) {
+	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start)
 			break;
@@ -472,7 +469,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 
 	btrfs_extent_post_op(trans, fs_info->tree_root);
 
-	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
+	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
@@ -521,7 +518,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 	int err = 0;
 	u32 refs;
 
-	while(1) {
+	while (1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
 						 ARRAY_SIZE(gang),
 						 BTRFS_ROOT_TRANS_TAG);
@@ -653,7 +650,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	int ret = 0;
 	int err;
 
-	while(!list_empty(list)) {
+	while (!list_empty(list)) {
 		struct btrfs_root *root;
 
 		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
@@ -663,13 +660,12 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		root = dirty->latest_root;
 		atomic_inc(&root->fs_info->throttles);
 
-		while(1) {
+		while (1) {
 			trans = btrfs_start_transaction(tree_root, 1);
 			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
-			if (ret != -EAGAIN) {
+			if (ret != -EAGAIN)
 				break;
-			}
 			mutex_unlock(&root->fs_info->drop_mutex);
 
 			err = btrfs_update_root(trans,
@@ -874,7 +870,7 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
 	struct list_head *head = &trans->transaction->pending_snapshots;
 	int ret;
 
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		pending = list_entry(head->next,
 				     struct btrfs_pending_snapshot, list);
 		ret = finish_pending_snapshot(fs_info, pending);
@@ -1076,9 +1072,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-	if (root->fs_info->closing) {
+	if (root->fs_info->closing)
 		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
-	}
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ffe7f639732..ea292117f88 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -66,9 +66,9 @@ static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
 	trans->block_group = BTRFS_I(inode)->block_group;
 }
 
-static inline void btrfs_update_inode_block_group(struct
-						  btrfs_trans_handle *trans,
-						  struct inode *inode)
+static inline void btrfs_update_inode_block_group(
+					  struct btrfs_trans_handle *trans,
+					  struct inode *inode)
 {
 	BTRFS_I(inode)->block_group = trans->block_group;
 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a6a3956cedf..3e8358c3616 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,10 +23,11 @@
 #include "transaction.h"
 #include "locking.h"
 
-/* defrag all the leaves in a given btree.  If cache_only == 1, don't read things
- * from disk, otherwise read all the leaves and try to get key order to
+/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
+ * things from disk, otherwise read all the leaves and try to get key order to
  * better reflect disk order
  */
+
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {
@@ -65,9 +66,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(root->node);
 	orig_level = level;
 
-	if (level == 0) {
+	if (level == 0)
 		goto out;
-	}
+
 	if (root->defrag_progress.objectid == 0) {
 		struct extent_buffer *root_node;
 		u32 nritems;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b1c2921f5be..3a72a1b6c24 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -829,7 +829,7 @@ conflict_again:
 		 */
 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
-		while(ptr < ptr_end) {
+		while (ptr < ptr_end) {
 			victim_ref = (struct btrfs_inode_ref *)ptr;
 			victim_name_len = btrfs_inode_ref_name_len(leaf,
 								   victim_ref);
@@ -938,9 +938,8 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 
 	file_bytes = (item_size / csum_size) * root->sectorsize;
 	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
-	if (!sums) {
+	if (!sums)
 		return -ENOMEM;
-	}
 
 	INIT_LIST_HEAD(&sums->list);
 	sums->len = file_bytes;
@@ -952,7 +951,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	sector_sum = sums->sums;
 	cur_offset = key->offset;
 	ptr = btrfs_item_ptr_offset(eb, slot);
-	while(item_size > 0) {
+	while (item_size > 0) {
 		sector_sum->bytenr = cur_offset;
 		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
@@ -995,7 +994,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			break;
@@ -1012,7 +1011,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
 						   path->slots[0]);
-		while(ptr < ptr_end) {
+		while (ptr < ptr_end) {
 			struct btrfs_inode_ref *ref;
 
 			ref = (struct btrfs_inode_ref *)ptr;
@@ -1048,7 +1047,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
 	key.type = BTRFS_ORPHAN_ITEM_KEY;
 	key.offset = (u64)-1;
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0)
 			break;
@@ -1206,8 +1205,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	if (key->type == BTRFS_DIR_ITEM_KEY) {
 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
 				       name, name_len, 1);
-	}
-	else if (key->type == BTRFS_DIR_INDEX_KEY) {
+	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
 						     key->objectid,
 						     key->offset, name,
@@ -1282,7 +1280,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	ptr_end = ptr + item_size;
-	while(ptr < ptr_end) {
+	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
 		name_len = btrfs_dir_name_len(eb, di);
 		ret = replay_one_name(trans, root, path, eb, di, key);
@@ -1408,7 +1406,7 @@ again:
 	item_size = btrfs_item_size_nr(eb, slot);
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	ptr_end = ptr + item_size;
-	while(ptr < ptr_end) {
+	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
 		name_len = btrfs_dir_name_len(eb, di);
 		name = kmalloc(name_len, GFP_NOFS);
@@ -1513,14 +1511,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 again:
 	range_start = 0;
 	range_end = 0;
-	while(1) {
+	while (1) {
 		ret = find_dir_range(log, path, dirid, key_type,
 				     &range_start, &range_end);
 		if (ret != 0)
 			break;
 
 		dir_key.offset = range_start;
-		while(1) {
+		while (1) {
 			int nritems;
 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
 						0, 0);
@@ -1676,7 +1674,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	return 0;
 }
 
-static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path, int *level,
 				   struct walk_control *wc)
@@ -1694,7 +1692,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
-	while(*level > 0) {
+	while (*level > 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
@@ -1753,11 +1751,11 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
-	if (path->nodes[*level] == root->node) {
+	if (path->nodes[*level] == root->node)
 		parent = path->nodes[*level];
-	} else {
+	else
 		parent = path->nodes[*level + 1];
-	}
+
 	bytenr = path->nodes[*level]->start;
 
 	blocksize = btrfs_level_size(root, *level);
@@ -1790,7 +1788,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path, int *level,
 				 struct walk_control *wc)
@@ -1801,7 +1799,7 @@ static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
 	int slot;
 	int ret;
 
-	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
@@ -1875,7 +1873,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	extent_buffer_get(log->node);
 	path->slots[level] = 0;
 
-	while(1) {
+	while (1) {
 		wret = walk_down_log_tree(trans, log, path, &level, wc);
 		if (wret > 0)
 			break;
@@ -1941,7 +1939,7 @@ static int wait_log_commit(struct btrfs_root *log)
 			schedule();
 		finish_wait(&log->fs_info->tree_log_wait, &wait);
 		mutex_lock(&log->fs_info->tree_log_mutex);
-	} while(transid == log->fs_info->tree_log_transid &&
+	} while (transid == log->fs_info->tree_log_transid &&
 		atomic_read(&log->fs_info->tree_log_commit));
 	return 0;
 }
@@ -1965,13 +1963,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 	atomic_set(&log->fs_info->tree_log_commit, 1);
 
-	while(1) {
+	while (1) {
 		batch = log->fs_info->tree_log_batch;
 		mutex_unlock(&log->fs_info->tree_log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&log->fs_info->tree_log_mutex);
 
-		while(atomic_read(&log->fs_info->tree_log_writers)) {
+		while (atomic_read(&log->fs_info->tree_log_writers)) {
 			DEFINE_WAIT(wait);
 			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
@@ -2030,7 +2028,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	ret = walk_log_tree(trans, log, &wc);
 	BUG_ON(ret);
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
 				    0, &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -2287,9 +2285,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			struct btrfs_key tmp;
 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
 					      path->slots[0]);
-			if (key_type == tmp.type) {
+			if (key_type == tmp.type)
 				first_offset = max(min_offset, tmp.offset) + 1;
-			}
 		}
 		goto done;
 	}
@@ -2319,7 +2316,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	 * we have a block from this transaction, log every item in it
 	 * from our directory
 	 */
-	while(1) {
+	while (1) {
 		struct btrfs_key tmp;
 		src = path->nodes[0];
 		nritems = btrfs_header_nritems(src);
@@ -2396,7 +2393,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 again:
 	min_key = 0;
 	max_key = 0;
-	while(1) {
+	while (1) {
 		ret = log_dir_items(trans, root, inode, path,
 				    dst_path, key_type, min_key,
 				    &max_key);
@@ -2432,7 +2429,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	key.type = max_key_type;
 	key.offset = (u64)-1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
 
 		if (ret != 1)
@@ -2481,7 +2478,7 @@ static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
 	list_add_tail(&sums->list, list);
 
 	path = btrfs_alloc_path();
-	while(disk_bytenr < end) {
+	while (disk_bytenr < end) {
 		if (!item || disk_bytenr < item_start_offset ||
 		    disk_bytenr >= item_last_offset) {
 			struct btrfs_key found_key;
@@ -2496,7 +2493,8 @@ static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
 				if (ret == -ENOENT || ret == -EFBIG)
 					ret = 0;
 				sum = 0;
-				printk("log no csum found for byte %llu\n",
+				printk(KERN_INFO "log no csum found for "
+				       "byte %llu\n",
 				       (unsigned long long)disk_bytenr);
 				item = NULL;
 				btrfs_release_path(root, path);
@@ -2643,7 +2641,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	 * we have to do this after the loop above to avoid changing the
 	 * log tree while trying to change the log tree.
 	 */
-	while(!list_empty(&ordered_sums)) {
+	while (!list_empty(&ordered_sums)) {
 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
 						   struct btrfs_ordered_sum,
 						   list);
@@ -2736,7 +2734,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	path->keep_locks = 1;
 
-	while(1) {
+	while (1) {
 		ins_nr = 0;
 		ret = btrfs_search_forward(root, &min_key, &max_key,
 					   path, 0, trans->transid);
@@ -2848,7 +2846,7 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
 
 	start_log_trans(trans, root);
 	sb = dentry->d_inode->i_sb;
-	while(1) {
+	while (1) {
 		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
 					inode_only);
 		BUG_ON(ret);
@@ -2919,7 +2917,7 @@ again:
 	key.offset = (u64)-1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 		if (ret < 0)
 			break;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6672adcec9f..b187b537888 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -140,7 +140,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-static int noinline run_scheduled_bios(struct btrfs_device *device)
+static noinline int run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -187,7 +187,7 @@ loop:
 	}
 	spin_unlock(&device->io_lock);
 
-	while(pending) {
+	while (pending) {
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -458,7 +458,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		bdev = open_bdev_exclusive(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
-			printk("open %s failed\n", device->name);
+			printk(KERN_INFO "open %s failed\n", device->name);
 			goto error;
 		}
 		set_blocksize(bdev, 4096);
@@ -570,14 +570,15 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
-		printk("device label %s ", disk_super->label);
+		printk(KERN_INFO "device label %s ", disk_super->label);
 	else {
 		/* FIXME, make a readl uuid parser */
-		printk("device fsid %llx-%llx ",
+		printk(KERN_INFO "device fsid %llx-%llx ",
 		       *(unsigned long long *)disk_super->fsid,
 		       *(unsigned long long *)(disk_super->fsid + 8));
 	}
-	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
+	printk(KERN_INFO "devid %llu transid %llu %s\n",
+	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 	brelse(bh);
@@ -683,9 +684,8 @@ no_more_items:
 				goto check_pending;
 			}
 		}
-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
 			goto next;
-		}
 
 		start_found = 1;
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1001,14 +1001,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
 	    root->fs_info->fs_devices->rw_devices <= 4) {
-		printk("btrfs: unable to go below four devices on raid10\n");
+		printk(KERN_ERR "btrfs: unable to go below four devices "
+		       "on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
 	    root->fs_info->fs_devices->rw_devices <= 2) {
-		printk("btrfs: unable to go below two devices on raid1\n");
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1031,7 +1033,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		bh = NULL;
 		disk_super = NULL;
 		if (!device) {
-			printk("btrfs: no missing devices found to remove\n");
+			printk(KERN_ERR "btrfs: no missing devices found to "
+			       "remove\n");
 			goto out;
 		}
 	} else {
@@ -1060,7 +1063,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-		printk("btrfs: unable to remove the only writeable device\n");
+		printk(KERN_ERR "btrfs: unable to remove the only writeable "
+		       "device\n");
 		ret = -EINVAL;
 		goto error_brelse;
 	}
@@ -1286,9 +1290,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EINVAL;
 
 	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
-	if (!bdev) {
+	if (!bdev)
 		return -EIO;
-	}
 
 	if (root->fs_info->fs_devices->seeding) {
 		seeding_dev = 1;
@@ -1401,8 +1404,8 @@ error:
 	goto out;
 }
 
-static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
-				 struct btrfs_device *device)
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+					struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1563,7 +1566,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
-	printk("btrfs relocating chunk %llu\n",
+	printk(KERN_INFO "btrfs relocating chunk %llu\n",
 	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
@@ -1748,7 +1751,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0)
 			goto error;
@@ -1916,7 +1919,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
 					int num_stripes, int sub_stripes)
 {
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
@@ -2041,7 +2044,7 @@ again:
 		min_free += 1024 * 1024;
 
 	INIT_LIST_HEAD(&private_devs);
-	while(index < num_stripes) {
+	while (index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
@@ -2242,7 +2245,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root,
 					 struct btrfs_device *device)
 {
@@ -2338,7 +2341,7 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 {
 	struct extent_map *em;
 
-	while(1) {
+	while (1) {
 		spin_lock(&tree->map_tree.lock);
 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
 		if (em)
@@ -2413,9 +2416,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
-	if (multi_ret && !(rw & (1 << BIO_RW))) {
+	if (multi_ret && !(rw & (1 << BIO_RW)))
 		stripes_allocated = 1;
-	}
 again:
 	if (multi_ret) {
 		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
@@ -2434,7 +2436,9 @@ again:
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu len %Lu\n", logical, *length);
+		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+		       (unsigned long long)logical,
+		       (unsigned long long)*length);
 		BUG();
 	}
 
@@ -2541,9 +2545,8 @@ again:
 			device = map->stripes[stripe_index].dev;
 			if (device->bdev) {
 				bdi = blk_get_backing_dev_info(device->bdev);
-				if (bdi->unplug_io_fn) {
+				if (bdi->unplug_io_fn)
 					bdi->unplug_io_fn(bdi, unplug_page);
-				}
 			}
 		} else {
 			multi->stripes[i].physical =
@@ -2717,7 +2720,7 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static int noinline schedule_bio(struct btrfs_root *root,
+static noinline int schedule_bio(struct btrfs_root *root,
 				 struct btrfs_device *device,
 				 int rw, struct bio *bio)
 {
@@ -2785,8 +2788,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	total_devs = multi->num_stripes;
 	if (map_length < length) {
-		printk("mapping failed logical %Lu bio len %Lu "
-		       "len %Lu\n", logical, length, map_length);
+		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		       "len %llu\n", (unsigned long long)logical,
+		       (unsigned long long)length,
+		       (unsigned long long)map_length);
 		BUG();
 	}
 	multi->end_io = first_bio->bi_end_io;
@@ -2794,7 +2799,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
-	while(dev_nr < total_devs) {
+	while (dev_nr < total_devs) {
 		if (total_devs > 1) {
 			if (dev_nr < total_devs - 1) {
 				bio = bio_clone(first_bio, GFP_NOFS);
@@ -3058,7 +3063,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -EIO;
 
 		if (!device) {
-			printk("warning devid %Lu missing\n", devid);
+			printk(KERN_WARNING "warning devid %llu missing\n",
+			       (unsigned long long)devid);
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
@@ -3078,12 +3084,6 @@ static int read_one_dev(struct btrfs_root *root,
 	if (device->writeable)
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 	ret = 0;
-#if 0
-	ret = btrfs_open_device(device);
-	if (ret) {
-		kfree(device);
-	}
-#endif
 	return ret;
 }
 
@@ -3174,7 +3174,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	key.type = 0;
 again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 4146f0710e6..7f332e27089 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -264,7 +264,8 @@ struct xattr_handler *btrfs_xattr_handlers[] = {
  */
 static bool btrfs_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	return !strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c4617cde6c7..ecfbce836d3 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -152,7 +152,7 @@ static int free_workspace(struct workspace *workspace)
 static void free_workspaces(void)
 {
 	struct workspace *workspace;
-	while(!list_empty(&idle_workspace)) {
+	while (!list_empty(&idle_workspace)) {
 		workspace = list_entry(idle_workspace.next, struct workspace,
 				       list);
 		list_del(&workspace->list);
@@ -397,12 +397,10 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 		ret = -1;
 		goto out;
 	}
-	while(workspace->inf_strm.total_in < srclen) {
+	while (workspace->inf_strm.total_in < srclen) {
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
-		if (ret != Z_OK && ret != Z_STREAM_END) {
+		if (ret != Z_OK && ret != Z_STREAM_END)
 			break;
-		}
-
 		/*
 		 * buf start is the byte offset we're of the start of
 		 * our workspace buffer
@@ -424,16 +422,14 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 			/* we didn't make progress in this inflate
 			 * call, we're done
 			 */
-			if (ret != Z_STREAM_END) {
+			if (ret != Z_STREAM_END)
 				ret = -1;
-			}
 			break;
 		}
 
 		/* we haven't yet hit data corresponding to this page */
-		if (total_out <= start_byte) {
+		if (total_out <= start_byte)
 			goto next;
-		}
 
 		/*
 		 * the start of the data we care about is offset into
@@ -448,7 +444,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 		current_buf_start = buf_start;
 
 		/* copy bytes from the working buffer into the pages */
-		while(working_bytes > 0) {
+		while (working_bytes > 0) {
 			bytes = min(PAGE_CACHE_SIZE - pg_offset,
 				    PAGE_CACHE_SIZE - buf_offset);
 			bytes = min(bytes, working_bytes);
@@ -471,6 +467,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 					ret = 0;
 					goto done;
 				}
+
 				page_out = bvec[page_out_index].bv_page;
 				pg_offset = 0;
 				page_bytes_left = PAGE_CACHE_SIZE;
@@ -480,9 +477,8 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 				 * make sure our new page is covered by this
 				 * working buffer
 				 */
-				if (total_out <= start_byte) {
+				if (total_out <= start_byte)
 					goto next;
-				}
 
 				/* the next page in the biovec might not
 				 * be adjacent to the last page, but it
@@ -517,11 +513,10 @@ next:
 							   PAGE_CACHE_SIZE);
 		}
 	}
-	if (ret != Z_STREAM_END) {
+	if (ret != Z_STREAM_END)
 		ret = -1;
-	} else {
+	else
 		ret = 0;
-	}
 done:
 	zlib_inflateEnd(&workspace->inf_strm);
 	if (data_in)
@@ -579,16 +574,15 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 		goto out;
 	}
 
-	while(bytes_left > 0) {
+	while (bytes_left > 0) {
 		unsigned long buf_start;
 		unsigned long buf_offset;
 		unsigned long bytes;
 		unsigned long pg_offset = 0;
 
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
-		if (ret != Z_OK && ret != Z_STREAM_END) {
+		if (ret != Z_OK && ret != Z_STREAM_END)
 			break;
-		}
 
 		buf_start = total_out;
 		total_out = workspace->inf_strm.total_out;
@@ -598,15 +592,13 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 			break;
 		}
 
-		if (total_out <= start_byte) {
+		if (total_out <= start_byte)
 			goto next;
-		}
 
-		if (total_out > start_byte && buf_start < start_byte) {
+		if (total_out > start_byte && buf_start < start_byte)
 			buf_offset = start_byte - buf_start;
-		} else {
+		else
 			buf_offset = 0;
-		}
 
 		bytes = min(PAGE_CACHE_SIZE - pg_offset,
 			    PAGE_CACHE_SIZE - buf_offset);
@@ -622,11 +614,12 @@ next:
 		workspace->inf_strm.next_out = workspace->buf;
 		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 	}
-	if (ret != Z_STREAM_END && bytes_left != 0) {
+
+	if (ret != Z_STREAM_END && bytes_left != 0)
 		ret = -1;
-	} else {
+	else
 		ret = 0;
-	}
+
 	zlib_inflateEnd(&workspace->inf_strm);
 out:
 	free_workspace(workspace);
-- 
cgit v1.2.3-70-g09d2


From 07d400a6df4767a90d49a153fdb7f4cfa1e3f23e Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 6 Jan 2009 11:42:00 -0500
Subject: Btrfs: tree logging checksum fixes

This patch contains the following changes:

1) Limit the max size of btrfs_ordered_sum structure to PAGE_SIZE.  This
struct is kmalloced so we want to keep it reasonable.

2) Replace copy_extent_csums with btrfs_lookup_csums_range.
copy_extent_csums was duplicated code in tree-log.c.

3) Remove replay_one_csum. csum items are replayed at the same time as
   replaying file extents. This guarantees we only replay useful csums.

4) nbytes accounting fix.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c |   2 +-
 fs/btrfs/file-item.c   |  62 ++++++-----
 fs/btrfs/inode.c       |   5 +-
 fs/btrfs/tree-log.c    | 293 +++++++++++++++----------------------------------
 4 files changed, 130 insertions(+), 232 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 171ca30a375..293da650873 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5579,7 +5579,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
 
 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
-	ret = btrfs_lookup_csums_range(root, disk_bytenr,
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
 				       disk_bytenr + len - 1, &list);
 
 	while (!list_empty(&list)) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b11abfad81a..964652435fd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -27,6 +27,12 @@
 #define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
+
+#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+				   sizeof(struct btrfs_ordered_sum)) / \
+				   sizeof(struct btrfs_sector_sum) * \
+				   (r)->sectorsize - (r)->sectorsize)
+
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -259,8 +265,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	key.offset = start;
 	key.type = BTRFS_EXTENT_CSUM_KEY;
 
-	ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
-				&key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto fail;
 	if (ret > 0 && path->slots[0] > 0) {
@@ -279,7 +284,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	while (start <= end) {
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
-			ret = btrfs_next_leaf(root->fs_info->csum_root, path);
+			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto fail;
 			if (ret > 0)
@@ -306,33 +311,38 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			continue;
 		}
 
-		size = min(csum_end, end + 1) - start;
-		sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS);
-		BUG_ON(!sums);
+		csum_end = min(csum_end, end + 1);
+		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_csum_item);
+		while (start < csum_end) {
+			size = min_t(size_t, csum_end - start,
+					MAX_ORDERED_SUM_BYTES(root));
+			sums = kzalloc(btrfs_ordered_sum_size(root, size),
+					GFP_NOFS);
+			BUG_ON(!sums);
 
-		sector_sum = sums->sums;
-		sums->bytenr = start;
-		sums->len = size;
+			sector_sum = sums->sums;
+			sums->bytenr = start;
+			sums->len = size;
 
-		offset = (start - key.offset) >>
-			 root->fs_info->sb->s_blocksize_bits;
-		offset *= csum_size;
+			offset = (start - key.offset) >>
+				root->fs_info->sb->s_blocksize_bits;
+			offset *= csum_size;
 
-		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				      struct btrfs_csum_item);
-		while (size > 0) {
-			read_extent_buffer(path->nodes[0], &sector_sum->sum,
-					   ((unsigned long)item) + offset,
-					   csum_size);
-			sector_sum->bytenr = start;
-
-			size -= root->sectorsize;
-			start += root->sectorsize;
-			offset += csum_size;
-			sector_sum++;
+			while (size > 0) {
+				read_extent_buffer(path->nodes[0],
+						&sector_sum->sum,
+						((unsigned long)item) +
+						offset, csum_size);
+				sector_sum->bytenr = start;
+
+				size -= root->sectorsize;
+				start += root->sectorsize;
+				offset += csum_size;
+				sector_sum++;
+			}
+			list_add_tail(&sums->list, list);
 		}
-		list_add_tail(&sums->list, list);
-
 		path->slots[0]++;
 	}
 	ret = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0ca9c3723c..4e57fe68e4b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -157,7 +157,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	key.objectid = inode->i_ino;
 	key.offset = start;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	inode_add_bytes(inode, size);
 	datasize = btrfs_file_extent_calc_inline_size(cur_size);
 
 	inode_add_bytes(inode, size);
@@ -920,8 +919,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
 	struct btrfs_ordered_sum *sums;
 	LIST_HEAD(list);
 
-	ret = btrfs_lookup_csums_range(root, bytenr, bytenr + num_bytes - 1,
-				       &list);
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+				       bytenr + num_bytes - 1, &list);
 	if (ret == 0 && list_empty(&list))
 		return 0;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3a72a1b6c24..332ec35d2c0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -433,49 +433,6 @@ insert:
 						   trans->transid);
 		}
 	}
-
-	if (overwrite_root &&
-	    key->type == BTRFS_EXTENT_DATA_KEY) {
-		int extent_type;
-		struct btrfs_file_extent_item *fi;
-
-		fi = (struct btrfs_file_extent_item *)dst_ptr;
-		extent_type = btrfs_file_extent_type(path->nodes[0], fi);
-		if (extent_type == BTRFS_FILE_EXTENT_REG ||
-		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			struct btrfs_key ins;
-			ins.objectid = btrfs_file_extent_disk_bytenr(
-							path->nodes[0], fi);
-			ins.offset = btrfs_file_extent_disk_num_bytes(
-							path->nodes[0], fi);
-			ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-			/*
-			 * is this extent already allocated in the extent
-			 * allocation tree?  If so, just add a reference
-			 */
-			ret = btrfs_lookup_extent(root, ins.objectid,
-						  ins.offset);
-			if (ret == 0) {
-				ret = btrfs_inc_extent_ref(trans, root,
-						ins.objectid, ins.offset,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid);
-			} else {
-				/*
-				 * insert the extent pointer in the extent
-				 * allocation tree
-				 */
-				ret = btrfs_alloc_logged_extent(trans, root,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid,
-						&ins);
-				BUG_ON(ret);
-			}
-		}
-	}
 no_copy:
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(root, path);
@@ -530,6 +487,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	u64 extent_end;
 	u64 alloc_hint;
 	u64 start = key->offset;
+	u64 saved_nbytes;
 	struct btrfs_file_extent_item *item;
 	struct inode *inode = NULL;
 	unsigned long size;
@@ -591,17 +549,95 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(root, path);
 
+	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
 	ret = btrfs_drop_extents(trans, root, inode,
 			 start, extent_end, start, &alloc_hint);
 	BUG_ON(ret);
 
-	/* insert the extent */
-	ret = overwrite_item(trans, root, path, eb, slot, key);
-	BUG_ON(ret);
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		unsigned long dest_offset;
+		struct btrfs_key ins;
+
+		ret = btrfs_insert_empty_item(trans, root, path, key,
+					      sizeof(*item));
+		BUG_ON(ret);
+		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+						    path->slots[0]);
+		copy_extent_buffer(path->nodes[0], eb, dest_offset,
+				(unsigned long)item,  sizeof(*item));
+
+		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+		if (ins.objectid > 0) {
+			u64 csum_start;
+			u64 csum_end;
+			LIST_HEAD(ordered_sums);
+			/*
+			 * is this extent already allocated in the extent
+			 * allocation tree?  If so, just add a reference
+			 */
+			ret = btrfs_lookup_extent(root, ins.objectid,
+						ins.offset);
+			if (ret == 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						ins.objectid, ins.offset,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid, key->objectid);
+			} else {
+				/*
+				 * insert the extent pointer in the extent
+				 * allocation tree
+				 */
+				ret = btrfs_alloc_logged_extent(trans, root,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid, key->objectid,
+						&ins);
+				BUG_ON(ret);
+			}
+			btrfs_release_path(root, path);
+
+			if (btrfs_file_extent_compression(eb, item)) {
+				csum_start = ins.objectid;
+				csum_end = csum_start + ins.offset;
+			} else {
+				csum_start = ins.objectid +
+					btrfs_file_extent_offset(eb, item);
+				csum_end = csum_start +
+					btrfs_file_extent_num_bytes(eb, item);
+			}
+
+			ret = btrfs_lookup_csums_range(root->log_root,
+						csum_start, csum_end - 1,
+						&ordered_sums);
+			BUG_ON(ret);
+			while (!list_empty(&ordered_sums)) {
+				struct btrfs_ordered_sum *sums;
+				sums = list_entry(ordered_sums.next,
+						struct btrfs_ordered_sum,
+						list);
+				ret = btrfs_csum_file_blocks(trans,
+						root->fs_info->csum_root,
+						sums);
+				BUG_ON(ret);
+				list_del(&sums->list);
+				kfree(sums);
+			}
+		} else {
+			btrfs_release_path(root, path);
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		/* inline extents are easy, we just overwrite them */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+	}
 
-	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
-	inode_add_bytes(inode, extent_end - start);
+	inode_set_bytes(inode, saved_nbytes);
 	btrfs_update_inode(trans, root, inode);
 out:
 	if (inode)
@@ -902,70 +938,6 @@ out_nowrite:
 	return 0;
 }
 
-/*
- * replay one csum item from the log tree into the subvolume 'root'
- * eb, slot and key all refer to the log tree
- * path is for temp use by this function and should be released on return
- *
- * This copies the checksums out of the log tree and inserts them into
- * the subvolume.  Any existing checksums for this range in the file
- * are overwritten, and new items are added where required.
- *
- * We keep this simple by reusing the btrfs_ordered_sum code from
- * the data=ordered mode.  This basically means making a copy
- * of all the checksums in ram, which we have to do anyway for kmap
- * rules.
- *
- * The copy is then sent down to btrfs_csum_file_blocks, which
- * does all the hard work of finding existing items in the file
- * or adding new ones.
- */
-static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct extent_buffer *eb, int slot,
-				      struct btrfs_key *key)
-{
-	int ret;
-	u32 item_size = btrfs_item_size_nr(eb, slot);
-	u64 cur_offset;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
-	unsigned long file_bytes;
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	unsigned long ptr;
-
-	file_bytes = (item_size / csum_size) * root->sectorsize;
-	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
-	if (!sums)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&sums->list);
-	sums->len = file_bytes;
-	sums->bytenr = key->offset;
-
-	/*
-	 * copy all the sums into the ordered sum struct
-	 */
-	sector_sum = sums->sums;
-	cur_offset = key->offset;
-	ptr = btrfs_item_ptr_offset(eb, slot);
-	while (item_size > 0) {
-		sector_sum->bytenr = cur_offset;
-		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
-		sector_sum++;
-		item_size -= csum_size;
-		ptr += csum_size;
-		cur_offset += root->sectorsize;
-	}
-
-	/* let btrfs_csum_file_blocks add them into the file */
-	ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
-	BUG_ON(ret);
-	kfree(sums);
-	return 0;
-}
 /*
  * There are a few corners where the link count of the file can't
  * be properly maintained during replay.  So, instead of adding
@@ -1659,10 +1631,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
 			BUG_ON(ret);
-		} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
-			ret = replay_one_csum(wc->trans, root, path,
-					      eb, i, &key);
-			BUG_ON(ret);
 		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
 			   key.type == BTRFS_DIR_INDEX_KEY) {
 			ret = replay_one_dir_item(wc->trans, root, path,
@@ -2021,7 +1989,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 		.process_func = process_one_buffer
 	};
 
-	if (!root->log_root)
+	if (!root->log_root || root->fs_info->log_root_recovering)
 		return 0;
 
 	log = root->log_root;
@@ -2453,86 +2421,6 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
-				      struct list_head *list,
-				      struct btrfs_root *root,
-				      u64 disk_bytenr, u64 len)
-{
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	int ret;
-	struct btrfs_path *path;
-	struct btrfs_csum_item *item = NULL;
-	u64 end = disk_bytenr + len;
-	u64 item_start_offset = 0;
-	u64 item_last_offset = 0;
-	u32 diff;
-	u32 sum;
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
-
-	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
-
-	sector_sum = sums->sums;
-	sums->bytenr = disk_bytenr;
-	sums->len = len;
-	list_add_tail(&sums->list, list);
-
-	path = btrfs_alloc_path();
-	while (disk_bytenr < end) {
-		if (!item || disk_bytenr < item_start_offset ||
-		    disk_bytenr >= item_last_offset) {
-			struct btrfs_key found_key;
-			u32 item_size;
-
-			if (item)
-				btrfs_release_path(root, path);
-			item = btrfs_lookup_csum(NULL, root, path,
-						 disk_bytenr, 0);
-			if (IS_ERR(item)) {
-				ret = PTR_ERR(item);
-				if (ret == -ENOENT || ret == -EFBIG)
-					ret = 0;
-				sum = 0;
-				printk(KERN_INFO "log no csum found for "
-				       "byte %llu\n",
-				       (unsigned long long)disk_bytenr);
-				item = NULL;
-				btrfs_release_path(root, path);
-				goto found;
-			}
-			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-					      path->slots[0]);
-
-			item_start_offset = found_key.offset;
-			item_size = btrfs_item_size_nr(path->nodes[0],
-						       path->slots[0]);
-			item_last_offset = item_start_offset +
-				(item_size / csum_size) *
-				root->sectorsize;
-			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
-					      struct btrfs_csum_item);
-		}
-		/*
-		 * this byte range must be able to fit inside
-		 * a single leaf so it will also fit inside a u32
-		 */
-		diff = disk_bytenr - item_start_offset;
-		diff = diff / root->sectorsize;
-		diff = diff * csum_size;
-
-		read_extent_buffer(path->nodes[0], &sum,
-				   ((unsigned long)item) + diff,
-				   csum_size);
-found:
-		sector_sum->bytenr = disk_bytenr;
-		sector_sum->sum = sum;
-		disk_bytenr += root->sectorsize;
-		sector_sum++;
-	}
-	btrfs_free_path(path);
-	return 0;
-}
-
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *log,
 			       struct btrfs_path *dst_path,
@@ -2622,10 +2510,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						   trans->transid,
 						   ins_keys[i].objectid);
 					BUG_ON(ret);
-					ret = copy_extent_csums(trans,
-						&ordered_sums,
-						log->fs_info->csum_root,
-						ds + cs, cl);
+					ret = btrfs_lookup_csums_range(
+						   log->fs_info->csum_root,
+						   ds + cs, ds + cs + cl - 1,
+						   &ordered_sums);
 					BUG_ON(ret);
 				}
 			}
@@ -2942,9 +2830,9 @@ again:
 		tmp_key.offset = (u64)-1;
 
 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
-
 		BUG_ON(!wc.replay_dest);
 
+		wc.replay_dest->log_root = log;
 		btrfs_record_root_in_trans(wc.replay_dest);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
@@ -2961,6 +2849,7 @@ again:
 		}
 
 		key.offset = found_key.offset - 1;
+		wc.replay_dest->log_root = NULL;
 		free_extent_buffer(log->node);
 		kfree(log);
 
-- 
cgit v1.2.3-70-g09d2


From e293e97e363e419d8a3628a927321e3f75206a0b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 Jan 2009 13:14:17 -0500
Subject: Btrfs: explicitly mark the tree log root for writeback

Each subvolume has an extent_state_tree used to mark metadata
that needs to be sent to disk while syncing the tree.  This is
used in addition to the dirty bits on the pages themselves so that
a single subvolume can be sent to disk efficiently in disk order.

Normally this marking happens in btrfs_alloc_free_block, which also does
special recording of dirty tree blocks for the tree log roots.

Yan Zheng noticed that when the root of the log tree is allocated, it is added
to the wrong writeback list.  The fix used here is to explicitly set
it dirty as part of tree log creation.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 332ec35d2c0..d81cda2e077 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -158,6 +158,19 @@ static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	 */
 	new_root->ref_cows = 0;
 	new_root->last_trans = trans->transid;
+
+	/*
+	 * we need to make sure the root block for this new tree
+	 * is marked as dirty in the dirty_log_pages tree.  This
+	 * is how it gets flushed down to disk at tree log commit time.
+	 *
+	 * the tree logging mutex keeps others from coming in and changing
+	 * the new_root->node, so we can safely access it here
+	 */
+	set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
+			 new_root->node->start + new_root->node->len - 1,
+			 GFP_NOFS);
+
 fail:
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2