summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c640
1 files changed, 545 insertions, 95 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34e..68c84c8c24b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,7 @@
#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
+#include <asm/unaligned.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -44,6 +45,20 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
@@ -184,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
void btrfs_csum_final(u32 crc, char *result)
{
- *(__le32 *)result = ~cpu_to_le32(crc);
+ put_unaligned_le32(~crc, result);
}
/*
@@ -309,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
int num_copies = 0;
int mirror_num = 0;
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -317,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
!verify_parent_transid(io_tree, eb, parent_transid))
return ret;
+ /*
+ * This buffer's crc is fine, but its contents are corrupted, so
+ * there is no reason to read the other copies, they won't be
+ * any less wrong.
+ */
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ return ret;
+
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);
if (num_copies == 1)
@@ -345,14 +369,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
+ if (page->private == EXTENT_PAGE_PRIVATE) {
+ WARN_ON(1);
goto out;
- if (!page->private)
+ }
+ if (!page->private) {
+ WARN_ON(1);
goto out;
+ }
len = page->private >> 2;
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ WARN_ON(1);
+ goto out;
+ }
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
@@ -397,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
return ret;
}
+#define CORRUPT(reason, eb, root, slot) \
+ printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
+ "root=%llu, slot=%d\n", reason, \
+ (unsigned long long)btrfs_header_bytenr(eb), \
+ (unsigned long long)root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_key leaf_key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (nritems == 0)
+ return 0;
+
+ /* Check the 0 item */
+ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("invalid item offset size pair", leaf, root, 0);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure each items keys are in the correct order and their
+ * offsets make sense. We only have to loop through nritems-1 because
+ * we check the current slot against the next slot, which verifies the
+ * next slot's offset+size makes sense and that the current's slot
+ * offset is correct.
+ */
+ for (slot = 0; slot < nritems - 1; slot++) {
+ btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+ CORRUPT("bad key order", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (btrfs_item_offset_nr(leaf, slot) !=
+ btrfs_item_end_nr(leaf, slot + 1)) {
+ CORRUPT("slot offset bad", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just incase all the items are consistent to eachother, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("slot end outside of leaf", leaf, root, slot);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
{
@@ -427,6 +526,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ ret = -EIO;
+ goto out;
+ }
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
@@ -459,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
btrfs_set_buffer_lockdep_class(eb, found_level);
ret = csum_tree_block(root, eb, 1);
- if (ret)
+ if (ret) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /*
+ * If this is a leaf block and it is corrupt, set the corrupt bit so
+ * that we don't try and read the other copies of this block, just
+ * return -EIO.
+ */
+ if (found_level == 0 && check_leaf(root, eb)) {
+ set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
+ }
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
@@ -821,7 +936,6 @@ static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
- .sync_page = block_sync_page,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
@@ -1134,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root, fs_info, location->objectid);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ kfree(root);
+ return ERR_PTR(-ENOMEM);
+ }
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
if (ret == 0) {
l = path->nodes[0];
@@ -1145,6 +1262,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
}
btrfs_free_path(path);
if (ret) {
+ kfree(root);
if (ret > 0)
ret = -ENOENT;
return ERR_PTR(ret);
@@ -1157,8 +1275,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
out:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
+ btrfs_check_and_init_root_item(&root->root_item);
+ }
return root;
}
@@ -1304,82 +1424,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
}
/*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct btrfs_device *device;
- struct btrfs_fs_info *info;
-
- info = (struct btrfs_fs_info *)bdi->unplug_io_data;
- list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
-
- bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi->unplug_io_fn)
- bdi->unplug_io_fn(bdi, page);
- }
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct inode *inode;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct address_space *mapping;
- u64 offset;
-
- /* the generic O_DIRECT read code does this */
- if (1 || !page) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- /*
- * page->mapping may change at any time. Get a consistent copy
- * and use that for everything below
- */
- smp_mb();
- mapping = page->mapping;
- if (!mapping)
- return;
-
- inode = mapping->host;
-
- /*
- * don't do the expensive searching for a small number of
- * devices
- */
- if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- offset = page_offset(page);
-
- em_tree = &BTRFS_I(inode)->extent_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
- read_unlock(&em_tree->lock);
- if (!em) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- __unplug_io_fn(bdi, page);
- return;
- }
- offset = offset - em->start;
- btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
- em->block_start + offset, page);
- free_extent_map(em);
-}
-
-/*
* If this fails, caller must call bdi_destroy() to get rid of the
* bdi again.
*/
@@ -1393,8 +1437,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return err;
bdi->ra_pages = default_backing_dev_info.ra_pages;
- bdi->unplug_io_fn = btrfs_unplug_io_fn;
- bdi->unplug_io_data = info;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
return 0;
@@ -1527,6 +1569,7 @@ static int transaction_kthread(void *arg)
spin_unlock(&root->fs_info->new_trans_lock);
trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
if (transid == trans->transid) {
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
@@ -1604,6 +1647,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_bdi;
}
+ fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1713,8 +1758,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh)
+ if (!bh) {
+ err = -EINVAL;
goto fail_iput;
+ }
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1774,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!btrfs_super_root(disk_super))
goto fail_iput;
+ /* check FS state, whether FS is broken. */
+ fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+ btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
+ /*
+ * In the long term, we'll store the compression type in the super
+ * block, and it'll be used for per file compression control.
+ */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
@@ -1744,10 +1802,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
features = btrfs_super_incompat_flags(disk_super);
- if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- btrfs_set_super_incompat_flags(disk_super, features);
- }
+ features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+ if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1932,6 +1990,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_alloc_profile = (u64)-1;
fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+ ret = btrfs_init_space_info(fs_info);
+ if (ret) {
+ printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+ goto fail_block_groups;
+ }
+
ret = btrfs_read_block_groups(extent_root);
if (ret) {
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -1957,7 +2021,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_set_opt(fs_info->mount_opt, SSD);
}
- if (btrfs_super_log_root(disk_super) != 0) {
+ /* do not make disk changes in broken FS */
+ if (btrfs_super_log_root(disk_super) != 0 &&
+ !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
@@ -2021,9 +2087,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!(sb->s_flags & MS_RDONLY)) {
down_read(&fs_info->cleanup_work_sem);
- btrfs_orphan_cleanup(fs_info->fs_root);
- btrfs_orphan_cleanup(fs_info->tree_root);
+ err = btrfs_orphan_cleanup(fs_info->fs_root);
+ if (!err)
+ err = btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
+ if (err) {
+ close_ctree(tree_root);
+ return ERR_PTR(err);
+ }
}
return tree_root;
@@ -2398,8 +2469,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[ret - 1]->root_key.objectid + 1;
for (i = 0; i < ret; i++) {
+ int err;
+
root_objectid = gang[i]->root_key.objectid;
- btrfs_orphan_cleanup(gang[i]);
+ err = btrfs_orphan_cleanup(gang[i]);
+ if (err)
+ return err;
}
root_objectid++;
}
@@ -2421,10 +2496,14 @@ int btrfs_commit_super(struct btrfs_root *root)
up_write(&root->fs_info->cleanup_work_sem);
trans = btrfs_join_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
/* run commit again to drop the original snapshot */
trans = btrfs_join_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_commit_transaction(trans, root);
ret = btrfs_write_and_wait_transaction(NULL, root);
BUG_ON(ret);
@@ -2442,8 +2521,28 @@ int close_ctree(struct btrfs_root *root)
smp_mb();
btrfs_put_block_group_cache(fs_info);
+
+ /*
+ * Here come 2 situations when btrfs is broken to flip readonly:
+ *
+ * 1. when btrfs flips readonly somewhere else before
+ * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+ * and btrfs will skip to write sb directly to keep
+ * ERROR state on disk.
+ *
+ * 2. when btrfs flips readonly just in btrfs_commit_super,
+ * and in such case, btrfs cannot write sb via btrfs_commit_super,
+ * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+ * btrfs will cleanup all FS resources first and write sb then.
+ */
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
- ret = btrfs_commit_super(root);
+ ret = btrfs_commit_super(root);
+ if (ret)
+ printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ }
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ ret = btrfs_error_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
@@ -2502,6 +2601,8 @@ int close_ctree(struct btrfs_root *root)
kfree(fs_info->chunk_root);
kfree(fs_info->dev_root);
kfree(fs_info->csum_root);
+ kfree(fs_info);
+
return 0;
}
@@ -2619,6 +2720,355 @@ out:
return 0;
}
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only)
+{
+ if (read_only)
+ return;
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ printk(KERN_WARNING "warning: mount fs with errors, "
+ "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+ int ret;
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
+ /* cleanup FS via transaction */
+ btrfs_cleanup_transaction(root);
+
+ ret = write_ctree_super(NULL, root, 0);
+
+ return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+ struct list_head splice;
+ struct btrfs_ordered_extent *ordered;
+ struct inode *inode;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_extents, &splice);
+ while (!list_empty(&splice)) {
+ ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+ root_extent_list);
+
+ list_del_init(&ordered->root_extent_list);
+ atomic_inc(&ordered->refs);
+
+ /* the inode may be getting freed (in sys_unlink path). */
+ inode = igrab(ordered->inode);
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ if (inode)
+ iput(inode);
+
+ atomic_set(&ordered->refs, 1);
+ btrfs_put_ordered_extent(ordered);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ int ret = 0;
+
+ delayed_refs = &trans->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ if (delayed_refs->num_entries == 0) {
+ printk(KERN_INFO "delayed_refs has NO entry\n");
+ return ret;
+ }
+
+ node = rb_first(&delayed_refs->root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+
+ atomic_set(&ref->refs, 1);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ mutex_lock(&head->mutex);
+ kfree(head->extent_op);
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+ list_del_init(&head->cluster);
+ mutex_unlock(&head->mutex);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_delayed_ref(ref);
+
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+
+ return ret;
+}
+
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+ struct btrfs_pending_snapshot *snapshot;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ list_splice_init(&t->pending_snapshots, &splice);
+
+ while (!list_empty(&splice)) {
+ snapshot = list_entry(splice.next,
+ struct btrfs_pending_snapshot,
+ list);
+
+ list_del_init(&snapshot->list);
+
+ kfree(snapshot);
+ }
+
+ return 0;
+}
+
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+ spin_lock(&root->fs_info->delalloc_lock);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ delalloc_inodes);
+
+ list_del_init(&btrfs_inode->delalloc_inodes);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ return 0;
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark)
+{
+ int ret;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+ u64 start = 0;
+ u64 end;
+ u64 offset;
+ unsigned long index;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark);
+ if (ret)
+ break;
+
+ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+ offset = page_offset(page);
+
+ spin_lock(&dirty_pages->buffer_lock);
+ eb = radix_tree_lookup(
+ &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+ offset >> PAGE_CACHE_SHIFT);
+ spin_unlock(&dirty_pages->buffer_lock);
+ if (eb) {
+ ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags);
+ atomic_set(&eb->refs, 1);
+ }
+ if (PageWriteback(page))
+ end_page_writeback(page);
+
+ lock_page(page);
+ if (PageDirty(page)) {
+ clear_page_dirty_for_io(page);
+ spin_lock_irq(&page->mapping->tree_lock);
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irq(&page->mapping->tree_lock);
+ }
+
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ }
+ }
+
+ return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents)
+{
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
+ unpin = pinned_extents;
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ /* opt_discard */
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_error_discard_extent(root, start,
+ end + 1 - start,
+ NULL);
+
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ btrfs_error_unpin_extent_range(root, start, end);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+ struct btrfs_transaction *t;
+ LIST_HEAD(list);
+
+ WARN_ON(1);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ list_splice_init(&root->fs_info->trans_list, &list);
+ while (!list_empty(&list)) {
+ t = list_entry(list.next, struct btrfs_transaction, list);
+ if (!t)
+ break;
+
+ btrfs_destroy_ordered_operations(root);
+
+ btrfs_destroy_ordered_extents(root);
+
+ btrfs_destroy_delayed_refs(t, root);
+
+ btrfs_block_rsv_release(root,
+ &root->fs_info->trans_block_rsv,
+ t->dirty_pages.dirty_bytes);
+
+ /* FIXME: cleanup wait for commit */
+ t->in_commit = 1;
+ t->blocked = 1;
+ if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ t->blocked = 0;
+ if (waitqueue_active(&root->fs_info->transaction_wait))
+ wake_up(&root->fs_info->transaction_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ t->commit_done = 1;
+ if (waitqueue_active(&t->commit_wait))
+ wake_up(&t->commit_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+
+ btrfs_destroy_pending_snapshots(t);
+
+ btrfs_destroy_delalloc_inodes(root);
+
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->new_trans_lock);
+
+ btrfs_destroy_marked_extents(root, &t->dirty_pages,
+ EXTENT_DIRTY);
+
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
+
+ atomic_set(&t->use_count, 0);
+ list_del_init(&t->list);
+ memset(t, 0, sizeof(*t));
+ kmem_cache_free(btrfs_transaction_cachep, t);
+ }
+
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ return 0;
+}
+
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,