From 66d8f3dd1c87813d7f1cf8b774cb03e9b8d7e87e Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:02:28 -0600
Subject: Btrfs: add a new "type" field into the block reservation structure

Sometimes we need choose the method of the reservation according to the type
of the block reservation, such as the reservation for the delayed inode update.
Now we identify the type just by comparing the address of the reservation
variants, it is very ugly if it is a temporary one because we need compare it
with all the common reservation variants. So we add a new "type" field to keep
the type the reservation variants.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/disk-io.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2e..0dcfb998229 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2014,12 +2014,15 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-	btrfs_init_block_rsv(&fs_info->global_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
-	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
-	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv,
+			     BTRFS_BLOCK_RSV_GLOBAL);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+			     BTRFS_BLOCK_RSV_DELALLOC);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+			     BTRFS_BLOCK_RSV_DELOPS);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
-- 
cgit v1.2.3-70-g09d2


From 2ecb79239bcd04c9d410f4cdce16adb6840b19da Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:04:27 -0600
Subject: Btrfs: fix unprotected ->log_batch

We forget to protect ->log_batch when syncing a file, this patch fix
this problem by atomic operation. And ->log_batch is used to check
if there are parallel sync operations or not, so it is unnecessary to
reset it to 0 after the sync operation of the current log tree complete.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.h    |  2 +-
 fs/btrfs/disk-io.c  |  2 +-
 fs/btrfs/file.c     |  4 ++--
 fs/btrfs/tree-log.c | 12 +++++-------
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2990a7ea624..6923b9e4f90 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1491,9 +1491,9 @@ struct btrfs_root {
 	wait_queue_head_t log_commit_wait[2];
 	atomic_t log_writers;
 	atomic_t log_commit[2];
+	atomic_t log_batch;
 	unsigned long log_transid;
 	unsigned long last_log_commit;
-	unsigned long log_batch;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0dcfb998229..34717ac2416 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1168,8 +1168,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_commit[0], 0);
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
+	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
-	root->log_batch = 0;
 	root->log_transid = 0;
 	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a9d7815cf58..793bc89c660 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1551,9 +1551,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * ordered range does a filemape_write_and_wait_range which is why we
 	 * don't do it above like other file systems.
 	 */
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	btrfs_wait_ordered_range(inode, start, end);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 
 	/*
 	 * check the transaction that last modified this inode
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f9b0fc91166..038a5229404 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -147,7 +147,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			root->log_multiple_pids = true;
 		}
 
-		root->log_batch++;
+		atomic_inc(&root->log_batch);
 		atomic_inc(&root->log_writers);
 		mutex_unlock(&root->log_mutex);
 		return 0;
@@ -166,7 +166,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			err = ret;
 	}
 	mutex_unlock(&root->fs_info->tree_log_mutex);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	atomic_inc(&root->log_writers);
 	mutex_unlock(&root->log_mutex);
 	return err;
@@ -2036,7 +2036,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
 		wait_log_commit(trans, root, root->log_transid - 1);
 	while (1) {
-		unsigned long batch = root->log_batch;
+		int batch = atomic_read(&root->log_batch);
 		/* when we're on an ssd, just kick the log commit out */
 		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
 			mutex_unlock(&root->log_mutex);
@@ -2044,7 +2044,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			mutex_lock(&root->log_mutex);
 		}
 		wait_for_writer(trans, root);
-		if (batch == root->log_batch)
+		if (batch == atomic_read(&root->log_batch))
 			break;
 	}
 
@@ -2073,7 +2073,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	btrfs_set_root_node(&log->root_item, log->node);
 
-	root->log_batch = 0;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
@@ -2086,7 +2085,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->log_mutex);
 
 	mutex_lock(&log_root_tree->log_mutex);
-	log_root_tree->log_batch++;
+	atomic_inc(&log_root_tree->log_batch);
 	atomic_inc(&log_root_tree->log_writers);
 	mutex_unlock(&log_root_tree->log_mutex);
 
@@ -2156,7 +2155,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
 				btrfs_header_level(log_root_tree->node));
 
-	log_root_tree->log_batch = 0;
 	log_root_tree->log_transid++;
 	smp_mb();
 
-- 
cgit v1.2.3-70-g09d2


From 0647d6bd16c03c928a22d1b90c4c8684c5908f08 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Fri, 7 Sep 2012 20:01:26 -0600
Subject: Btrfs: cleanup for unused ref cache stuff

As ref cache has been removed from btrfs, there is no user on
its lock and its check.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ctree.h   | 3 ---
 fs/btrfs/disk-io.c | 5 -----
 2 files changed, 8 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6923b9e4f90..305002b7bf3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1376,9 +1376,6 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
-	spinlock_t ref_cache_lock;
-	u64 total_ref_cache_size;
-
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 34717ac2416..8054f7ccf46 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2000,7 +2000,6 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->trans_lock);
-	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -3214,10 +3213,6 @@ int close_ctree(struct btrfs_root *root)
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
 	}
-	if (fs_info->total_ref_cache_size) {
-		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-		       (unsigned long long)fs_info->total_ref_cache_size);
-	}
 
 	free_extent_buffer(fs_info->extent_root->node);
 	free_extent_buffer(fs_info->extent_root->commit_root);
-- 
cgit v1.2.3-70-g09d2


From 0433f20d436fd040013e5a419a2e3d732d618a41 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 13 Sep 2012 03:32:32 -0600
Subject: Btrfs: cleanup of error processing in btree_get_extent()

This patch simplifies a little complex error processing in
btree_get_extent().

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/disk-io.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8054f7ccf46..8db87bc53d2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -222,21 +222,17 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (em) {
-			ret = 0;
-		} else {
-			em = lookup_extent_mapping(em_tree, failed_start,
-						   failed_len);
-			ret = -EIO;
+		if (!em) {
+			lookup_extent_mapping(em_tree, failed_start,
+					      failed_len);
+			em = ERR_PTR(-EIO);
 		}
 	} else if (ret) {
 		free_extent_map(em);
-		em = NULL;
+		em = ERR_PTR(ret);
 	}
 	write_unlock(&em_tree->lock);
 
-	if (ret)
-		em = ERR_PTR(ret);
 out:
 	return em;
 }
-- 
cgit v1.2.3-70-g09d2


From b4f359ab065b9c6c132124557d23e90e2e8504be Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 13 Sep 2012 03:32:54 -0600
Subject: Btrfs: remove unnecessary code in btree_get_extent()

Unnecessary lookup_extent_mapping() is removed because an error is
returned to the caller.
This patch was made based on the advice from Stefan Behrens, thanks.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/disk-io.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8db87bc53d2..8d633e3e391 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -217,16 +217,10 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
-		u64 failed_start = em->start;
-		u64 failed_len = em->len;
-
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em) {
-			lookup_extent_mapping(em_tree, failed_start,
-					      failed_len);
+		if (!em)
 			em = ERR_PTR(-EIO);
-		}
 	} else if (ret) {
 		free_extent_map(em);
 		em = ERR_PTR(ret);
-- 
cgit v1.2.3-70-g09d2


From 2e90cf858fd2639eae2fb3f2173f7633cb15bf38 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Sep 2012 02:58:06 -0600
Subject: Btrfs: cleanup fs_info->hashers

fs_info->hashers is now an obsolete one.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ctree.h   | 1 -
 fs/btrfs/disk-io.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d66dc1c6a40..1372057e1ec 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1253,7 +1253,6 @@ struct btrfs_fs_info {
 	struct mutex reloc_mutex;
 
 	struct list_head trans_list;
-	struct list_head hashers;
 	struct list_head dead_roots;
 	struct list_head caching_block_groups;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8d633e3e391..5d3f813e13a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1984,7 +1984,6 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
-	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
-- 
cgit v1.2.3-70-g09d2


From 892951a92ebf8b390815b24f9418f297a5081898 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 13:51:53 -0400
Subject: Btrfs: remove unused write cache pages hook

The btree inode has it's own write cache pages so we can remove this write
cache pages hook as it's not used.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/disk-io.c | 47 -----------------------------------------------
 1 file changed, 47 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5d3f813e13a..2c18a4eb4d5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3347,52 +3347,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-int btree_lock_page_hook(struct page *page, void *data,
-				void (*flush_fn)(void *))
-{
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_buffer *eb;
-
-	/*
-	 * We culled this eb but the page is still hanging out on the mapping,
-	 * carry on.
-	 */
-	if (!PagePrivate(page))
-		goto out;
-
-	eb = (struct extent_buffer *)page->private;
-	if (!eb) {
-		WARN_ON(1);
-		goto out;
-	}
-	if (page != eb->pages[0])
-		goto out;
-
-	if (!btrfs_try_tree_write_lock(eb)) {
-		flush_fn(data);
-		btrfs_tree_lock(eb);
-	}
-	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
-	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		if (root->fs_info->dirty_metadata_bytes >= eb->len)
-			root->fs_info->dirty_metadata_bytes -= eb->len;
-		else
-			WARN_ON(1);
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
-
-	btrfs_tree_unlock(eb);
-out:
-	if (!trylock_page(page)) {
-		flush_fn(data);
-		lock_page(page);
-	}
-	return 0;
-}
-
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
@@ -3787,7 +3741,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
-	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
-- 
cgit v1.2.3-70-g09d2


From 354aa0fb6d5b97b262e056f7ad7bfc88d7ce0004 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 20 Sep 2012 01:54:00 -0600
Subject: Btrfs: fix orphan transaction on the freezed filesystem

With the following debug patch:

 static int btrfs_freeze(struct super_block *sb)
 {
+ 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_transaction *trans;
+
+	spin_lock(&fs_info->trans_lock);
+	trans = fs_info->running_transaction;
+	if (trans) {
+		printk("Transid %llu, use_count %d, num_writer %d\n",
+			trans->transid, atomic_read(&trans->use_count),
+			atomic_read(&trans->num_writers));
+	}
+	spin_unlock(&fs_info->trans_lock);
 	return 0;
 }

I found there was a orphan transaction after the freeze operation was done.

It is because the transaction may not be committed when the transaction handle
end even though it is the last handle of the current transaction. This design
avoid committing the transaction frequently, but also introduce the above
problem.

So I add btrfs_attach_transaction() which can catch the current transaction
and commit it. If there is no transaction, it will return ENOENT, and do not
anything.

This function also can be used to instead of btrfs_join_transaction_freeze()
because it don't increase the writer counter and don't start a new transaction,
so it also can fix the deadlock between sync and freeze.

Besides that, it is used to instead of btrfs_join_transaction() in
transaction_kthread(), because if there is no transaction, the transaction
kthread needn't anything.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/disk-io.c     |  5 +++--
 fs/btrfs/super.c       | 18 ++++++++++++++----
 fs/btrfs/transaction.c | 45 ++++++++++++++++++++++++++++++---------------
 fs/btrfs/transaction.h |  4 ++--
 4 files changed, 49 insertions(+), 23 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2c18a4eb4d5..dcaf55695e6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1657,9 +1657,10 @@ static int transaction_kthread(void *arg)
 		spin_unlock(&root->fs_info->trans_lock);
 
 		/* If the file system is aborted, this will always fail. */
-		trans = btrfs_join_transaction(root);
+		trans = btrfs_attach_transaction(root);
 		if (IS_ERR(trans)) {
-			cannot_commit = true;
+			if (PTR_ERR(trans) != -ENOENT)
+				cannot_commit = true;
 			goto sleep;
 		}
 		if (transid == trans->transid) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index fb260bbf59c..f8b803f30ed 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -854,10 +854,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	btrfs_wait_ordered_extents(root, 0);
 
-	trans = btrfs_join_transaction_freeze(root);
+	trans = btrfs_attach_transaction(root);
 	if (IS_ERR(trans)) {
-		/* Frozen, don't bother */
-		if (PTR_ERR(trans) == -EPERM)
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
 			return 0;
 		return PTR_ERR(trans);
 	}
@@ -1511,7 +1511,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-	return 0;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+	trans = btrfs_attach_transaction(root);
+	if (IS_ERR(trans)) {
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
+			return 0;
+		return PTR_ERR(trans);
+	}
+	return btrfs_commit_transaction(trans, root);
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d0a2b7e4938..69139a356f7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int nofail)
+static noinline int join_transaction(struct btrfs_root *root, int type)
 {
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
 	}
 
 	if (fs_info->trans_no_join) {
-		if (!nofail) {
+		/* 
+		 * If we are JOIN_NOLOCK we're already committing a current
+		 * transaction, we just need a handle to deal with something
+		 * when committing the transaction, such as inode cache and
+		 * space cache. It is a special case.
+		 */
+		if (type != TRANS_JOIN_NOLOCK) {
 			spin_unlock(&fs_info->trans_lock);
 			return -EBUSY;
 		}
@@ -87,6 +93,13 @@ loop:
 	}
 	spin_unlock(&fs_info->trans_lock);
 
+	/*
+	 * If we are ATTACH, we just want to catch the current transaction,
+	 * and commit it. If there is no transaction, just return ENOENT.
+	 */
+	if (type == TRANS_ATTACH)
+		return -ENOENT;
+
 	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
@@ -340,27 +353,28 @@ again:
 	 * because we're already holding a ref.  We need this because we could
 	 * have raced in and did an fsync() on a file which can kick a commit
 	 * and then we deadlock with somebody doing a freeze.
+	 *
+	 * If we are ATTACH, it means we just want to catch the current
+	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
 	 */
-	if (type != TRANS_JOIN_NOLOCK &&
-	    !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
-		if (type == TRANS_JOIN_FREEZE) {
-			kmem_cache_free(btrfs_trans_handle_cachep, h);
-			return ERR_PTR(-EPERM);
-		}
+	if (type < TRANS_JOIN_NOLOCK)
 		sb_start_intwrite(root->fs_info->sb);
-	}
 
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
 
 	do {
-		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+		ret = join_transaction(root, type);
 		if (ret == -EBUSY)
 			wait_current_trans(root);
 	} while (ret == -EBUSY);
 
 	if (ret < 0) {
-		sb_end_intwrite(root->fs_info->sb);
+		/* We must get the transaction if we are JOIN_NOLOCK. */
+		BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+		if (type < TRANS_JOIN_NOLOCK)
+			sb_end_intwrite(root->fs_info->sb);
 		kmem_cache_free(btrfs_trans_handle_cachep, h);
 		return ERR_PTR(ret);
 	}
@@ -432,9 +446,9 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
-struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root)
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN_FREEZE, 0);
+	return start_transaction(root, 0, TRANS_ATTACH, 0);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -605,7 +619,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	if (lock)
+	if (trans->type < TRANS_JOIN_NOLOCK)
 		sb_end_intwrite(root->fs_info->sb);
 
 	WARN_ON(cur_trans != info->running_transaction);
@@ -1678,7 +1692,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
-	sb_end_intwrite(root->fs_info->sb);
+	if (trans->type < TRANS_JOIN_NOLOCK)
+		sb_end_intwrite(root->fs_info->sb);
 
 	trace_btrfs_transaction_commit(root);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0630bd19396..80961947a6b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -52,7 +52,7 @@ enum btrfs_trans_type {
 	TRANS_JOIN,
 	TRANS_USERSPACE,
 	TRANS_JOIN_NOLOCK,
-	TRANS_JOIN_FREEZE,
+	TRANS_ATTACH,
 };
 
 struct btrfs_trans_handle {
@@ -109,7 +109,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_noflush(
 					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3-70-g09d2


From de0022b9da616b95ea5b41eab32da825b0b5150f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 25 Sep 2012 14:25:58 -0400
Subject: Btrfs: do not async metadata csumming in certain situations

There are a coule scenarios where farming metadata csumming off to an async
thread doesn't help.  The first is if our processor supports crc32c, in
which case the csumming will be fast and so the overhead of the async model
is not worth the cost.  The other case is for our tree log.  We will be
making that stuff dirty and writing it out and waiting for it immediately.
Even with software crc32c this gives me a ~15% increase in speed with O_SYNC
workloads.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/disk-io.c   | 22 ++++++++++++++++++++++
 fs/btrfs/extent_io.c | 14 ++++++++++++--
 fs/btrfs/extent_io.h |  1 +
 3 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dcaf55695e6..aa02eab8c40 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
@@ -859,10 +863,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+	if (bio_flags & EXTENT_BIO_TREE_LOG)
+		return 0;
+#ifdef CONFIG_X86
+	if (cpu_has_xmm4_2)
+		return 0;
+#endif
+	return 1;
+}
+
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	int async = check_async_write(inode, bio_flags);
 	int ret;
 
 	if (!(rw & REQ_WRITE)) {
@@ -877,6 +893,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			return ret;
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
+	} else if (!async) {
+		ret = btree_csum_one_bio(bio);
+		if (ret)
+			return ret;
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 0);
 	}
 
 	/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a2c21570adf..979fa0d6bfe 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
 	struct bio *bio;
 	struct extent_io_tree *tree;
 	get_extent_t *get_extent;
+	unsigned long bio_flags;
 
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
@@ -3163,12 +3164,16 @@ static int write_one_eb(struct extent_buffer *eb,
 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
 	u64 offset = eb->start;
 	unsigned long i, num_pages;
+	unsigned long bio_flags = 0;
 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
 	int ret = 0;
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	atomic_set(&eb->io_pages, num_pages);
+	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+		bio_flags = EXTENT_BIO_TREE_LOG;
+
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
 
@@ -3177,7 +3182,8 @@ static int write_one_eb(struct extent_buffer *eb,
 		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
 					 -1, end_bio_extent_buffer_writepage,
-					 0, 0, 0);
+					 0, epd->bio_flags, bio_flags);
+		epd->bio_flags = bio_flags;
 		if (ret) {
 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 			SetPageError(p);
@@ -3212,6 +3218,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 		.tree = tree,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	int ret = 0;
 	int done = 0;
@@ -3474,7 +3481,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
 		if (epd->sync_io)
 			rw = WRITE_SYNC;
 
-		ret = submit_one_bio(rw, epd->bio, 0, 0);
+		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
 		BUG_ON(ret < 0); /* -ENOMEM */
 		epd->bio = NULL;
 	}
@@ -3497,6 +3504,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = __extent_writepage(page, wbc, &epd);
@@ -3521,6 +3529,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.get_extent = get_extent,
 		.extent_locked = 1,
 		.sync_io = mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
@@ -3560,6 +3569,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 512f8da041f..a69dea21904 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
  * type for this bio
  */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
-- 
cgit v1.2.3-70-g09d2


From e6138876ad8327250d77291b3262fee356267211 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 27 Sep 2012 17:07:30 -0400
Subject: Btrfs: cache extent state when writing out dirty metadata pages

Everytime we write out dirty pages we search for an offset in the tree,
convert the bits in the state, and then when we wait we search for the
offset again and clear the bits.  So for every dirty range in the io tree we
are doing 4 rb searches, which is suboptimal.  With this patch we are only
doing 2 searches for every cycle (modulo weird things happening).  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/disk-io.c          |  4 ++--
 fs/btrfs/extent-tree.c      |  5 +++--
 fs/btrfs/extent_io.c        | 43 +++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/extent_io.h        |  6 ++++--
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/relocation.c       |  2 +-
 fs/btrfs/transaction.c      | 14 +++++++++-----
 fs/btrfs/tree-log.c         |  3 ++-
 8 files changed, 63 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aa02eab8c40..c69995556f6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3572,7 +3572,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark);
+					    mark, NULL);
 		if (ret)
 			break;
 
@@ -3627,7 +3627,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
 again:
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3270b108785..ca4aad96f81 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	while (start < end) {
 		ret = find_first_extent_bit(info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY | EXTENT_UPTODATE);
+					    EXTENT_DIRTY | EXTENT_UPTODATE,
+					    NULL);
 		if (ret)
 			break;
 
@@ -5045,7 +5046,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 979fa0d6bfe..e8ee39b7335 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -937,6 +937,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * @end:	the end offset in bytes (inclusive)
  * @bits:	the bits to set in this range
  * @clear_bits:	the bits to clear in this range
+ * @cached_state:	state that we're going to cache
  * @mask:	the allocation mask
  *
  * This will go through and set bits for the given range.  If any states exist
@@ -946,7 +947,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * boundary bits like LOCK.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask)
+		       int bits, int clear_bits,
+		       struct extent_state **cached_state, gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -963,6 +965,15 @@ again:
 	}
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start <= start && state->end > start &&
+		    state->tree) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -993,6 +1004,7 @@ hit_next:
 	 */
 	if (state->start == start && state->end <= end) {
 		set_state_bits(tree, state, &bits);
+		cache_state(state, cached_state);
 		state = clear_state_bit(tree, state, &clear_bits, 0);
 		if (last_end == (u64)-1)
 			goto out;
@@ -1033,6 +1045,7 @@ hit_next:
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, &bits);
+			cache_state(state, cached_state);
 			state = clear_state_bit(tree, state, &clear_bits, 0);
 			if (last_end == (u64)-1)
 				goto out;
@@ -1071,6 +1084,7 @@ hit_next:
 				   &bits);
 		if (err)
 			extent_io_tree_panic(tree, err);
+		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
@@ -1093,6 +1107,7 @@ hit_next:
 			extent_io_tree_panic(tree, err);
 
 		set_state_bits(tree, prealloc, &bits);
+		cache_state(prealloc, cached_state);
 		clear_state_bit(tree, prealloc, &clear_bits, 0);
 		prealloc = NULL;
 		goto out;
@@ -1297,18 +1312,42 @@ out:
  * If nothing was found, 1 is returned. If found something, return 0.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
+			  u64 *start_ret, u64 *end_ret, int bits,
+			  struct extent_state **cached_state)
 {
 	struct extent_state *state;
+	struct rb_node *n;
 	int ret = 1;
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->end == start - 1 && state->tree) {
+			n = rb_next(&state->rb_node);
+			while (n) {
+				state = rb_entry(n, struct extent_state,
+						 rb_node);
+				if (state->state & bits)
+					goto got_it;
+				n = rb_next(n);
+			}
+			free_extent_state(*cached_state);
+			*cached_state = NULL;
+			goto out;
+		}
+		free_extent_state(*cached_state);
+		*cached_state = NULL;
+	}
+
 	state = find_first_extent_bit_state(tree, start, bits);
+got_it:
 	if (state) {
+		cache_state(state, cached_state);
 		*start_ret = state->start;
 		*end_ret = state->end;
 		ret = 0;
 	}
+out:
 	spin_unlock(&tree->lock);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a69dea21904..7aeb31087f8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -233,13 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask);
+		       int bits, int clear_bits,
+		       struct extent_state **cached_state, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
 		      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits);
+			  u64 *start_ret, u64 *end_ret, int bits,
+			  struct extent_state **cached_state);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 						 u64 start, int bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b107e68797f..1027b854b90 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			       block_group->key.offset)) {
 		ret = find_first_extent_bit(unpin, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret) {
 			ret = 0;
 			break;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 6e530bb86c9..776f0aa128f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3621,7 +3621,7 @@ next:
 
 		ret = find_first_extent_bit(&rc->processed_blocks,
 					    key.objectid, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 
 		if (ret == 0 && start <= key.objectid) {
 			btrfs_release_path(path);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 69139a356f7..77db875b511 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -687,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      mark)) {
-		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
-				   GFP_NOFS);
+				      mark, &cached_state)) {
+		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				   mark, &cached_state, GFP_NOFS);
+		cached_state = NULL;
 		err = filemap_fdatawrite_range(mapping, start, end);
 		if (err)
 			werr = err;
@@ -717,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      EXTENT_NEED_WAIT)) {
-		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+				      EXTENT_NEED_WAIT, &cached_state)) {
+		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				 0, 0, &cached_state, GFP_NOFS);
 		err = filemap_fdatawait_range(mapping, start, end);
 		if (err)
 			werr = err;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1d7b3484432..f4b9e54b1da 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2463,7 +2463,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+				NULL);
 		if (ret)
 			break;
 
-- 
cgit v1.2.3-70-g09d2


From 5af3e8cce8b7ba0a2819e18c9146c8c0b452d479 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Wed, 1 Aug 2012 18:56:49 +0200
Subject: Btrfs: make filesystem read-only when submitting barrier fails

So far the return code of barrier_all_devices() is ignored, which
means that errors are ignored. The result can be a corrupt
filesystem which is not consistent.
This commit adds code to evaluate the return code of
barrier_all_devices(). The normal btrfs_error() mechanism is used to
switch the filesystem into read-only mode when errors are detected.

In order to decide whether barrier_all_devices() should return
error or success, the number of disks that are allowed to fail the
barrier submission is calculated. This calculation accounts for the
worst RAID level of metadata, system and data. If single, dup or
RAID0 is in use, a single disk error is already considered to be
fatal. Otherwise a single disk error is tolerated.

The calculation of the number of disks that are tolerated to fail
the barrier operation is performed when the filesystem gets mounted,
when a balance operation is started and finished, and when devices
are added or removed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/ctree.h    |   5 +++
 fs/btrfs/disk-io.c  | 109 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/disk-io.h  |   2 +
 fs/btrfs/ioctl.c    |   8 ++--
 fs/btrfs/tree-log.c |   7 +++-
 fs/btrfs/volumes.c  |  30 +++++++++++++++
 6 files changed, 142 insertions(+), 19 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50dcd0fbae1..1630be83121 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1468,6 +1468,8 @@ struct btrfs_fs_info {
 
 	/* next backup root to be overwritten */
 	int backup_root_index;
+
+	int num_tolerated_disk_barrier_failures;
 };
 
 /*
@@ -3361,6 +3363,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
 		      struct btrfs_ioctl_defrag_range_args *range,
 		      u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space);
+
 /* file.c */
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c69995556f6..83552368770 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2505,6 +2505,8 @@ retry_root_backup:
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
 		goto fail_block_groups;
 	}
+	fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
@@ -2888,12 +2890,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 			printk_in_rcu("btrfs: disabling barriers on dev %s\n",
 				      rcu_str_deref(device->name));
 			device->nobarriers = 1;
-		}
-		if (!bio_flagged(bio, BIO_UPTODATE)) {
+		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
-			if (!bio_flagged(bio, BIO_EOPNOTSUPP))
-				btrfs_dev_stat_inc_and_print(device,
-					BTRFS_DEV_STAT_FLUSH_ERRS);
+			btrfs_dev_stat_inc_and_print(device,
+				BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
 
 		/* drop the reference from the wait == 0 run */
@@ -2932,14 +2932,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
-	int errors = 0;
+	int errors_send = 0;
+	int errors_wait = 0;
 	int ret;
 
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_send++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2948,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 0);
 		if (ret)
-			errors++;
+			errors_send++;
 	}
 
 	/* wait for all the barriers */
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_wait++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2961,13 +2962,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 1);
 		if (ret)
-			errors++;
+			errors_wait++;
 	}
-	if (errors)
+	if (errors_send > info->num_tolerated_disk_barrier_failures ||
+	    errors_wait > info->num_tolerated_disk_barrier_failures)
 		return -EIO;
 	return 0;
 }
 
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_space_info *sinfo;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
+	int i;
+	int c;
+	int num_tolerated_disk_barrier_failures =
+		(int)fs_info->fs_devices->num_devices;
+
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		sinfo = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+			if (tmp->flags == types[i]) {
+				sinfo = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!sinfo)
+			continue;
+
+		down_read(&sinfo->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&sinfo->block_groups[c])) {
+				u64 flags;
+
+				btrfs_get_block_group_info(
+					&sinfo->block_groups[c], &space);
+				if (space.total_bytes == 0 ||
+				    space.used_bytes == 0)
+					continue;
+				flags = space.flags;
+				/*
+				 * return
+				 * 0: if dup, single or RAID0 is configured for
+				 *    any of metadata, system or data, else
+				 * 1: if RAID5 is configured, or if RAID1 or
+				 *    RAID10 is configured and only two mirrors
+				 *    are used, else
+				 * 2: if RAID6 is configured, else
+				 * num_mirrors - 1: if RAID1 or RAID10 is
+				 *                  configured and more than
+				 *                  2 mirrors are used.
+				 */
+				if (num_tolerated_disk_barrier_failures > 0 &&
+				    ((flags & (BTRFS_BLOCK_GROUP_DUP |
+					       BTRFS_BLOCK_GROUP_RAID0)) ||
+				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+				      == 0)))
+					num_tolerated_disk_barrier_failures = 0;
+				else if (num_tolerated_disk_barrier_failures > 1
+					 &&
+					 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+						   BTRFS_BLOCK_GROUP_RAID10)))
+					num_tolerated_disk_barrier_failures = 1;
+			}
+		}
+		up_read(&sinfo->groups_sem);
+	}
+
+	return num_tolerated_disk_barrier_failures;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
@@ -2990,8 +3065,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
 
-	if (do_barriers)
-		barrier_all_devices(root->fs_info);
+	if (do_barriers) {
+		ret = barrier_all_devices(root->fs_info);
+		if (ret) {
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
+			btrfs_error(root->fs_info, ret,
+				    "errors while submitting device barriers.");
+			return ret;
+		}
+	}
 
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fe..2025a9132c1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
 				void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d6836af6d60..f5a2e6c4320 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2875,8 +2875,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	return 0;
 }
 
-static void get_block_group_info(struct list_head *groups_list,
-				 struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space)
 {
 	struct btrfs_block_group_cache *block_group;
 
@@ -2984,8 +2984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 		down_read(&info->groups_sem);
 		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
 			if (!list_empty(&info->block_groups[c])) {
-				get_block_group_info(&info->block_groups[c],
-						     &space);
+				btrfs_get_block_group_info(
+					&info->block_groups[c], &space);
 				memcpy(dest, &space, sizeof(space));
 				dest++;
 				space_args.total_spaces++;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 47911fd1831..67eab2d4d8a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * in and cause problems either.
 	 */
 	btrfs_scrub_pause_super(root);
-	write_ctree_super(trans, root->fs_info->tree_root, 1);
+	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	btrfs_scrub_continue_super(root);
-	ret = 0;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_wake_log_root;
+	}
 
 	mutex_lock(&root->log_mutex);
 	if (root->last_log_commit < log_transid)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dfe5e3a22f5..029b903a4ae 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		free_fs_devices(cur_devices);
 	}
 
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
 	/*
 	 * at this point, the device is zero sized.  We want to
 	 * remove it from the devices list and zero out the old super
@@ -1799,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	btrfs_clear_space_info_full(root->fs_info);
 
 	unlock_chunks(root);
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
 	ret = btrfs_commit_transaction(trans, root);
 
 	if (seeding_dev) {
@@ -2809,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		}
 	}
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		int num_tolerated_disk_barrier_failures;
+		u64 target = bctl->sys.target;
+
+		num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+		if (num_tolerated_disk_barrier_failures > 0 &&
+		    (target &
+		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+			num_tolerated_disk_barrier_failures = 0;
+		else if (num_tolerated_disk_barrier_failures > 1 &&
+			 (target &
+			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+			num_tolerated_disk_barrier_failures = 1;
+
+		fs_info->num_tolerated_disk_barrier_failures =
+			num_tolerated_disk_barrier_failures;
+	}
+
 	ret = insert_balance_item(fs_info->tree_root, bctl);
 	if (ret && ret != -EEXIST)
 		goto out;
@@ -2841,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		__cancel_balance(fs_info);
 	}
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		fs_info->num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	}
+
 	wake_up(&fs_info->balance_wait_q);
 
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From 1037a5affc266fdfe08b2dd415d309ab2778ce6a Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Mon, 8 Oct 2012 07:26:15 -0600
Subject: Btrfs: remove repeated eb->pages check in,
 disk-io.c/csum_dirty_buffer

In csum_dirty_buffer, we first get eb from page->private.
Then we check if the page is the first page of eb. Later
we check it again. Remove the repeated check here.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
---
 fs/btrfs/disk-io.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 83552368770..7cda51995c1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -433,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 		return 0;
 	}
-	if (eb->pages[0] != page) {
-		WARN_ON(1);
-		return 0;
-	}
 	if (!PageUptodate(page)) {
 		WARN_ON(1);
 		return 0;
-- 
cgit v1.2.3-70-g09d2