diff options
Diffstat (limited to 'fs/btrfs')
33 files changed, 7956 insertions, 2649 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9adf5e4f7e9..a35eb36b32f 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,25 +1,10 @@ -ifneq ($(KERNELRELEASE),) -# kbuild part of makefile obj-$(CONFIG_BTRFS_FS) := btrfs.o -btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o delayed-ref.o -else - -# Normal Makefile - -KERNELDIR := /lib/modules/`uname -r`/build -all: - $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean: - $(MAKE) -C $(KERNELDIR) M=`pwd` clean - -endif + export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7fdd184a528..603972576f0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return ERR_PTR(-EINVAL); } + /* Handle the cached NULL acl case without locking */ + acl = ACCESS_ONCE(*p_acl); + if (!acl) + return acl; + spin_lock(&inode->i_lock); - if (*p_acl != BTRFS_ACL_NOT_CACHED) - acl = posix_acl_dup(*p_acl); + acl = *p_acl; + if (acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); - if (acl) + if (acl != BTRFS_ACL_NOT_CACHED) return acl; - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) btrfs_update_cached_acl(inode, p_acl, acl); } kfree(value); - } else if (size == -ENOENT) { + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; btrfs_update_cached_acl(inode, p_acl, acl); + } else { + acl = ERR_PTR(-EIO); } return acl; @@ -343,9 +351,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -int btrfs_check_acl(struct inode *inode, int mask) -{ - return 0; -} - #endif /* CONFIG_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 51bfdfc8fcd..7f88628a1a7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -25,6 +25,7 @@ #define WORK_QUEUED_BIT 0 #define WORK_DONE_BIT 1 #define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 /* * container for the kthread task pointer and the list of pending work @@ -36,6 +37,7 @@ struct btrfs_worker_thread { /* list of struct btrfs_work that are waiting for service */ struct list_head pending; + struct list_head prio_pending; /* list of worker threads from struct btrfs_workers */ struct list_head worker_list; @@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, spin_lock_irqsave(&workers->lock, flags); - while (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } if (!test_bit(WORK_DONE_BIT, &work->flags)) break; @@ -143,8 +151,14 @@ static int worker_loop(void *arg) do { spin_lock_irq(&worker->lock); again_locked: - while (!list_empty(&worker->pending)) { - cur = worker->pending.next; + while (1) { + if (!list_empty(&worker->prio_pending)) + cur = worker->prio_pending.next; + else if (!list_empty(&worker->pending)) + cur = worker->pending.next; + else + break; + work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); @@ -163,7 +177,6 @@ again_locked: spin_lock_irq(&worker->lock); check_idle_worker(worker); - } if (freezing(current)) { worker->working = 0; @@ -178,7 +191,8 @@ again_locked: * jump_in? */ smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; /* @@ -191,7 +205,8 @@ again_locked: */ schedule_timeout(1); smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; if (kthread_should_stop()) @@ -200,7 +215,8 @@ again_locked: /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) goto again_locked; /* @@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; @@ -273,13 +290,14 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) } INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); + worker->workers = workers; worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d", workers->name, workers->num_workers + i); - worker->workers = workers; if (IS_ERR(worker->task)) { kfree(worker); ret = PTR_ERR(worker->task); @@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work) goto out; spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle @@ -422,6 +443,11 @@ out: return 0; } +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + /* * places a struct btrfs_work into the pending queue of one of the kthreads */ @@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { spin_lock_irqsave(&workers->lock, flags); - list_add_tail(&work->order_list, &workers->order_list); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } spin_unlock_irqrestore(&workers->lock, flags); } else { INIT_LIST_HEAD(&work->order_list); @@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 31be4ed8b63..1b511c109db 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -85,6 +85,7 @@ struct btrfs_workers { * of work items waiting for completion */ struct list_head order_list; + struct list_head prio_order_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; @@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b30986f00b9..acb4f351758 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -72,6 +72,9 @@ struct btrfs_inode { */ struct list_head ordered_operations; + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -154,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } - #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ab07627084f..de1e2fd3208 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (btrfs_test_flag(inode, NODATASUM)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ atomic_inc(&cb->pending_bios); - if (!btrfs_test_flag(inode, NODATASUM)) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!btrfs_test_flag(inode, NODATASUM)) + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) btrfs_lookup_bio_sums(root, inode, comp_bio, sums); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h deleted file mode 100644 index 6e1b3de3670..00000000000 --- a/fs/btrfs/crc32c.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_CRC32C__ -#define __BTRFS_CRC32C__ -#include <linux/crc32c.h> - -/* - * this file used to do more for selecting the HW version of crc32c, - * perhaps it will one day again soon. - */ -#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) -#endif - diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e5b2533b691..60a45f3a4e9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, u32 nritems; int ret = 0; int level; - struct btrfs_root *new_root; - - new_root = kmalloc(sizeof(*new_root), GFP_NOFS); - if (!new_root) - return -ENOMEM; - - memcpy(new_root, root, sizeof(*new_root)); - new_root->root_key.objectid = new_root_objectid; + struct btrfs_disk_key disk_key; WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); @@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, - new_root_objectid, trans->transid, - level, buf->start, 0); - if (IS_ERR(cow)) { - kfree(new_root); + cow = btrfs_alloc_free_block(trans, root, buf->len, 0, + new_root_objectid, &disk_key, level, + buf->start, 0); + if (IS_ERR(cow)) return PTR_ERR(cow); - } copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_owner(cow, new_root_objectid); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); - ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); - kfree(new_root); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, } /* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in refernece counted trees and tree roots + * are never shared. If a block was allocated after the last + * snapshot and the block was not allocated by tree relocation, + * we know the block is not shared. + */ + if (root->ref_cows && + buf != root->node && buf != root->commit_root && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 1; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (root->ref_cows && + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + return 1; +#endif + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow) +{ + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. + * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(root, buf)) { + ret = btrfs_lookup_extent_info(trans, root, buf->start, + buf->len, &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 1); + BUG_ON(ret); + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + } + if (new_flags != 0) { + ret = btrfs_set_disk_extent_flags(trans, root, + buf->start, + buf->len, + new_flags, 0); + BUG_ON(ret); + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 1); + BUG_ON(ret); + } + clean_tree_block(trans, root, buf); + } + return 0; +} + +/* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked * dirty and returned locked. If you modify the block it needs to be marked @@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size) { - u64 parent_start; + struct btrfs_disk_key disk_key; struct extent_buffer *cow; - u32 nritems; - int ret = 0; int level; int unlock_orig = 0; + u64 parent_start; if (*cow_ret == buf) unlock_orig = 1; btrfs_assert_tree_locked(buf); - if (parent) - parent_start = parent->start; - else - parent_start = 0; - WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); WARN_ON(root->ref_cows && trans->transid != root->last_trans); level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, root->root_key.objectid, - trans->transid, level, - search_start, empty_size); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + else + parent_start = 0; + } else + parent_start = 0; + + cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, + root->root_key.objectid, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_owner(cow, root->root_key.objectid); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (btrfs_header_generation(buf) != trans->transid) { - u32 nr_extents; - ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); - if (ret) - return ret; - - ret = btrfs_cache_ref(trans, root, buf, nr_extents); - WARN_ON(ret); - } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { - /* - * There are only two places that can drop reference to - * tree blocks owned by living reloc trees, one is here, - * the other place is btrfs_drop_subtree. In both places, - * we check reference count while tree block is locked. - * Furthermore, if reference count is one, it won't get - * increased by someone else. - */ - u32 refs; - ret = btrfs_lookup_extent_ref(trans, root, buf->start, - buf->len, &refs); - BUG_ON(ret); - if (refs == 1) { - ret = btrfs_update_ref(trans, root, buf, cow, - 0, nritems); - clean_tree_block(trans, root, buf); - } else { - ret = btrfs_inc_ref(trans, root, buf, cow, NULL); - } - BUG_ON(ret); - } else { - ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); - if (ret) - return ret; - clean_tree_block(trans, root, buf); - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); - WARN_ON(ret); - } + update_ref_for_cow(trans, root, buf, cow); if (buf == root->node) { WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + else + parent_start = 0; spin_lock(&root->node_lock); root->node = cow; extent_buffer_get(cow); spin_unlock(&root->node_lock); - if (buf != root->commit_root) { - btrfs_free_extent(trans, root, buf->start, - buf->len, buf->start, - root->root_key.objectid, - btrfs_header_generation(buf), - level, 1); - } + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, + level, 0); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_set_node_blockptr(parent, parent_slot, cow->start); - WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - WARN_ON(btrfs_header_generation(parent) != trans->transid); btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, btrfs_header_owner(parent), - btrfs_header_generation(parent), level, 1); + parent_start, root->root_key.objectid, + level, 0); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 0; + return 1; +} + /* * cows a single block, see __btrfs_cow_block for the real work. * This version of it has extra checks so that a block isn't cow'd more than @@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - if (btrfs_header_generation(buf) == trans->transid && - btrfs_header_owner(buf) == root->root_key.objectid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; return 0; } @@ -469,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) /* * same as comp_keys only with two btrfs_key's */ -static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -845,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + return bin_search(eb, key, level, slot); +} + /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. @@ -921,13 +1033,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root->node = child; spin_unlock(&root->node_lock); - ret = btrfs_update_extent_ref(trans, root, child->start, - child->len, - mid->start, child->start, - root->root_key.objectid, - trans->transid, level - 1); - BUG_ON(ret); - add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -938,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); ret = btrfs_free_extent(trans, root, mid->start, mid->len, - mid->start, root->root_key.objectid, - btrfs_header_generation(mid), - level, 1); + 0, root->root_key.objectid, level, 1); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -949,8 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (trans->transaction->delayed_refs.flushing && - btrfs_header_nritems(mid) > 2) + if (btrfs_header_nritems(mid) > 2) return 0; if (btrfs_header_nritems(mid) < 2) @@ -998,7 +1100,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; if (btrfs_header_nritems(right) == 0) { u64 bytenr = right->start; - u64 generation = btrfs_header_generation(parent); u32 blocksize = right->len; clean_tree_block(trans, root, right); @@ -1010,9 +1111,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - generation, level, 1); + blocksize, 0, + root->root_key.objectid, + level, 0); if (wret) ret = wret; } else { @@ -1047,7 +1148,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } if (btrfs_header_nritems(mid) == 0) { /* we've managed to empty the middle node, drop it */ - u64 root_gen = btrfs_header_generation(parent); u64 bytenr = mid->start; u32 blocksize = mid->len; @@ -1059,9 +1159,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, - btrfs_header_owner(parent), - root_gen, level, 1); + 0, root->root_key.objectid, + level, 0); if (wret) ret = wret; } else { @@ -1325,12 +1424,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, int ret = 0; int blocksize; - parent = path->nodes[level - 1]; + parent = path->nodes[level + 1]; if (!parent) return 0; nritems = btrfs_header_nritems(parent); - slot = path->slots[level]; + slot = path->slots[level + 1]; blocksize = btrfs_level_size(root, level); if (slot > 0) { @@ -1341,7 +1440,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = 0; free_extent_buffer(eb); } - if (slot < nritems) { + if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); @@ -1351,7 +1450,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, } if (block1 || block2) { ret = -EAGAIN; + + /* release the whole path */ btrfs_release_path(root, path); + + /* read the blocks */ if (block1) readahead_tree_block(root, block1, blocksize, 0); if (block2) @@ -1361,7 +1464,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, eb = read_tree_block(root, block1, blocksize, 0); free_extent_buffer(eb); } - if (block1) { + if (block2) { eb = read_tree_block(root, block2, blocksize, 0); free_extent_buffer(eb); } @@ -1433,7 +1536,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks || path->lowest_level) + if (path->keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1465,6 +1568,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, u32 blocksize; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; + int ret; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); @@ -1472,6 +1576,10 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without sleeping, return + * right away + */ *eb_ret = tmp; return 0; } @@ -1479,18 +1587,34 @@ read_block_for_search(struct btrfs_trans_handle *trans, /* * reduce lock contention at high levels * of the btree by dropping locks before - * we read. + * we read. Don't release the lock on the current + * level because we need to walk this node to figure + * out which blocks to read. */ - btrfs_release_path(NULL, p); + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + if (tmp) free_extent_buffer(tmp); if (p->reada) reada_for_search(root, p, level, slot, key->objectid); + btrfs_release_path(NULL, p); + + ret = -EAGAIN; tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp) + if (tmp) { + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!btrfs_buffer_uptodate(tmp, 0)) + ret = -EIO; free_extent_buffer(tmp); - return -EAGAIN; + } + return ret; } /* @@ -1527,7 +1651,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { int sret; sret = reada_for_balance(root, p, level); @@ -1589,10 +1713,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_unlock = 2; again: - if (p->skip_locking) - b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); + if (p->search_commit_root) { + b = root->commit_root; + extent_buffer_get(b); + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + } while (b) { level = btrfs_header_level(b); @@ -1613,11 +1744,9 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (btrfs_header_generation(b) == trans->transid && - btrfs_header_owner(b) == root->root_key.objectid && - !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { + if (!should_cow_block(trans, root, b)) goto cow_done; - } + btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, @@ -1689,6 +1818,9 @@ cow_done: if (ret == -EAGAIN) goto again; + if (ret == -EIO) + goto done; + if (!p->skip_locking) { int lret; @@ -1731,141 +1863,11 @@ done: */ if (!p->leave_spinning) btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(root, p); return ret; } -int btrfs_merge_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *node_keys, - u64 *nodes, int lowest_level) -{ - struct extent_buffer *eb; - struct extent_buffer *parent; - struct btrfs_key key; - u64 bytenr; - u64 generation; - u32 blocksize; - int level; - int slot; - int key_match; - int ret; - - eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); - BUG_ON(ret); - - btrfs_set_lock_blocking(eb); - - parent = eb; - while (1) { - level = btrfs_header_level(parent); - if (level == 0 || level <= lowest_level) - break; - - ret = bin_search(parent, &node_keys[lowest_level], level, - &slot); - if (ret && slot > 0) - slot--; - - bytenr = btrfs_node_blockptr(parent, slot); - if (nodes[level - 1] == bytenr) - break; - - blocksize = btrfs_level_size(root, level - 1); - generation = btrfs_node_ptr_generation(parent, slot); - btrfs_node_key_to_cpu(eb, &key, slot); - key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); - - if (generation == trans->transid) { - eb = read_tree_block(root, bytenr, blocksize, - generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - } - - /* - * if node keys match and node pointer hasn't been modified - * in the running transaction, we can merge the path. for - * blocks owened by reloc trees, the node pointer check is - * skipped, this is because these blocks are fully controlled - * by the space balance code, no one else can modify them. - */ - if (!nodes[level - 1] || !key_match || - (generation == trans->transid && - btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { - if (level == 1 || level == lowest_level + 1) { - if (generation == trans->transid) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - break; - } - - if (generation != trans->transid) { - eb = read_tree_block(root, bytenr, blocksize, - generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - } - - ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb); - BUG_ON(ret); - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { - if (!nodes[level - 1]) { - nodes[level - 1] = eb->start; - memcpy(&node_keys[level - 1], &key, - sizeof(node_keys[0])); - } else { - WARN_ON(1); - } - } - - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - parent = eb; - continue; - } - - btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); - btrfs_set_node_ptr_generation(parent, slot, trans->transid); - btrfs_mark_buffer_dirty(parent); - - ret = btrfs_inc_extent_ref(trans, root, - nodes[level - 1], - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - level - 1); - BUG_ON(ret); - - /* - * If the block was created in the running transaction, - * it's possible this is the last reference to it, so we - * should drop the subtree. - */ - if (generation == trans->transid) { - ret = btrfs_drop_subtree(trans, root, eb, parent); - BUG_ON(ret); - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } else { - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - level - 1, 1); - BUG_ON(ret); - } - break; - } - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - return 0; -} - /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. @@ -1991,9 +1993,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); - ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); - BUG_ON(ret); - return ret; } @@ -2053,9 +2052,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); - ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); - BUG_ON(ret); - return ret; } @@ -2075,7 +2071,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; - int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -2087,16 +2082,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, trans->transid, + root->root_key.objectid, &lower_key, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); - memset_extent_buffer(c, 0, 0, root->nodesize); + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); btrfs_set_header_bytenr(c, c->start); btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); write_extent_buffer(c, root->fs_info->fsid, @@ -2121,12 +2117,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, root->node = c; spin_unlock(&root->node_lock); - ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->len, lower->start, c->start, - root->root_key.objectid, - trans->transid, level - 1); - BUG_ON(ret); - /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -2203,7 +2193,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else if (!trans->transaction->delayed_refs.flushing) { + } else { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2214,20 +2204,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, } c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, - path->nodes[level + 1]->start, + split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - trans->transid, level, c->start, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); - btrfs_set_header_flags(split, btrfs_header_flags(c)); + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); - btrfs_set_header_flags(split, 0); write_extent_buffer(split, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(split), BTRFS_FSID_SIZE); @@ -2235,7 +2226,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - mid = (c_nritems + 1) / 2; copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), @@ -2248,16 +2238,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - btrfs_node_key(split, &disk_key, 0); wret = insert_ptr(trans, root, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (wret) ret = wret; - ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); - BUG_ON(ret); - if (path->slots[level] >= mid) { path->slots[level] -= mid; btrfs_tree_unlock(c); @@ -2330,7 +2316,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 right_nritems; u32 data_end; u32 this_item_size; - int ret; if (empty) nr = 0; @@ -2443,9 +2428,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(left); btrfs_mark_buffer_dirty(right); - ret = btrfs_update_ref(trans, root, left, right, 0, push_items); - BUG_ON(ret); - btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); btrfs_mark_buffer_dirty(upper); @@ -2690,10 +2672,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); - ret = btrfs_update_ref(trans, root, right, left, - old_left_nritems, push_items); - BUG_ON(ret); - btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); if (wret) @@ -2850,9 +2828,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(l); BUG_ON(path->slots[0] != slot); - ret = btrfs_update_ref(trans, root, l, right, 0, nritems); - BUG_ON(ret); - if (mid <= slot) { btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -2881,6 +2856,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, int extend) { + struct btrfs_disk_key disk_key; struct extent_buffer *l; u32 nritems; int mid; @@ -2888,12 +2864,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *right; int ret = 0; int wret; - int double_split; + int split; int num_doubles = 0; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && - !trans->transaction->delayed_refs.flushing) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -2915,16 +2890,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return ret; } again: - double_split = 0; + split = 1; l = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(l); mid = (nritems + 1) / 2; - right = btrfs_alloc_free_block(trans, root, root->leafsize, - path->nodes[1]->start, + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + split = 2; + } + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + split = 0; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + split = 2 ; + } + } + } + } + + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); + + right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - trans->transid, 0, l->start, 0); + &disk_key, 0, l->start, 0); if (IS_ERR(right)) { BUG_ON(1); return PTR_ERR(right); @@ -2933,6 +2945,7 @@ again: memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, @@ -2943,79 +2956,47 @@ again: (unsigned long)btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); - if (mid <= slot) { - if (nritems == 1 || - leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (slot >= nritems) { - struct btrfs_disk_key disk_key; - - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - path->slots[1] += 1; - btrfs_mark_buffer_dirty(right); - return ret; - } - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; - } - } - } else { - if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (!extend && data_size && slot == 0) { - struct btrfs_disk_key disk_key; - - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, - path->slots[1], 1); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); if (wret) ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } - btrfs_mark_buffer_dirty(right); - return ret; - } else if ((extend || !data_size) && slot == 0) { - mid = 1; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; - } } } + btrfs_mark_buffer_dirty(right); + return ret; } ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); BUG_ON(ret); - if (double_split) { + if (split == 2) { BUG_ON(num_doubles != 0); num_doubles++; goto again; @@ -3417,7 +3398,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, /* figure out how many keys we can insert in here */ total_data = data_size[0]; for (i = 1; i < nr; i++) { - if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) + if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) break; total_data += data_size[i]; } @@ -3715,9 +3696,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * a helper function to delete the leaf pointed to by path->slots[1] and - * path->nodes[1]. bytenr is the node block pointer, but since the callers - * already know it, it is faster to have them pass it down than to - * read it out of the node again. + * path->nodes[1]. * * This deletes the pointer in path->nodes[1] and frees the leaf * block extent. zero is returned if it all worked out, < 0 otherwise. @@ -3725,15 +3704,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 bytenr) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { int ret; - u64 root_gen = btrfs_header_generation(path->nodes[1]); - u64 parent_start = path->nodes[1]->start; - u64 parent_owner = btrfs_header_owner(path->nodes[1]); + WARN_ON(btrfs_header_generation(leaf) != trans->transid); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; @@ -3744,10 +3722,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, bytenr, - btrfs_level_size(root, 0), - parent_start, parent_owner, - root_gen, 0, 1); + ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, + 0, root->root_key.objectid, 0, 0); return ret; } /* @@ -3815,7 +3791,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - ret = btrfs_del_leaf(trans, root, path, leaf->start); + ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); } } else { @@ -3831,8 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && - !trans->transaction->delayed_refs.flushing) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -3854,8 +3829,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, - leaf->start); + ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); free_extent_buffer(leaf); } else { @@ -4205,6 +4179,11 @@ again: if (ret == -EAGAIN) goto again; + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } + if (!path->skip_locking) { ret = btrfs_try_spin_lock(next); if (!ret) { @@ -4239,6 +4218,11 @@ again: if (ret == -EAGAIN) goto again; + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } + if (!path->skip_locking) { btrfs_assert_tree_locked(path->nodes[level]); ret = btrfs_try_spin_lock(next); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad96495dedc..03441a99ea3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -45,6 +45,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 +#define BTRFS_COMPAT_EXTENT_TREE_V0 + /* * files bigger than this get some pre-flushing when they are added * to the ordered operations list. That way we limit the total @@ -267,7 +269,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } #define BTRFS_FSID_SIZE 16 -#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 /* * every tree block (leaf or node) starts with this header. @@ -296,7 +309,6 @@ struct btrfs_header { sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) /* * this is a very generous portion of the super block, giving us @@ -355,9 +367,12 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_SUPP 0x0 -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 -#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) + +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF /* * A leaf is full of items. offset and size tell us where to find @@ -421,23 +436,65 @@ struct btrfs_path { unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int leave_spinning:1; + unsigned int search_commit_root:1; }; /* * items in the extent btree are used to record the objectid of the * owner of the block and the number of references */ + struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { __le32 refs; } __attribute__ ((__packed__)); -struct btrfs_extent_ref { +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ + sizeof(struct btrfs_item)) + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { __le64 root; __le64 generation; __le64 objectid; - __le32 num_refs; + __le32 count; } __attribute__ ((__packed__)); + /* dev extents record free space on individual devices. The owner * field points back to the chunk allocation mapping tree that allocated * the extent. The chunk tree uuid field is a way to double check the owner @@ -695,12 +752,7 @@ struct btrfs_block_group_cache { struct list_head cluster_list; }; -struct btrfs_leaf_ref_tree { - struct rb_root root; - struct list_head list; - spinlock_t lock; -}; - +struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; struct btrfs_fs_info { @@ -831,18 +883,11 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - /* tree relocation relocated fields */ - struct list_head dead_reloc_roots; - struct btrfs_leaf_ref_tree reloc_ref_tree; - struct btrfs_leaf_ref_tree shared_ref_tree; - struct kobject super_kobj; struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; - atomic_t throttles; - atomic_t throttle_gen; u64 total_pinned; @@ -861,6 +906,8 @@ struct btrfs_fs_info { */ struct list_head space_info; + struct reloc_control *reloc_ctl; + spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; @@ -881,6 +928,9 @@ struct btrfs_fs_info { u64 metadata_alloc_profile; u64 system_alloc_profile; + unsigned data_chunk_allocations; + unsigned metadata_ratio; + void *bdev_holder; }; @@ -888,7 +938,6 @@ struct btrfs_fs_info { * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ -struct btrfs_dirty_root; struct btrfs_root { struct extent_buffer *node; @@ -896,9 +945,6 @@ struct btrfs_root { spinlock_t node_lock; struct extent_buffer *commit_root; - struct btrfs_leaf_ref_tree *ref_tree; - struct btrfs_leaf_ref_tree ref_tree_struct; - struct btrfs_dirty_root *dirty_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; @@ -949,10 +995,15 @@ struct btrfs_root { /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; + struct list_head root_list; + spinlock_t list_lock; - struct list_head dead_list; struct list_head orphan_list; + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -1014,7 +1065,16 @@ struct btrfs_root { * are used, and how many references there are to each block */ #define BTRFS_EXTENT_ITEM_KEY 168 -#define BTRFS_EXTENT_REF_KEY 180 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 /* * block groups give us hints into the extent allocation trees. Which @@ -1040,6 +1100,8 @@ struct btrfs_root { #define BTRFS_MOUNT_COMPRESS (1 << 5) #define BTRFS_MOUNT_NOTREELOG (1 << 6) #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) +#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) +#define BTRFS_MOUNT_NOSSD (1 << 9) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1053,12 +1115,14 @@ struct btrfs_root { #define BTRFS_INODE_READONLY (1 << 2) #define BTRFS_INODE_NOCOMPRESS (1 << 3) #define BTRFS_INODE_PREALLOC (1 << 4) -#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ - ~BTRFS_INODE_##flag) -#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ - BTRFS_INODE_##flag) -#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ - BTRFS_INODE_##flag) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) + + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1314,24 +1378,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) return (u8 *)((unsigned long)dev + ptr); } -/* struct btrfs_extent_ref */ -BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); -BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); -BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); -BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, - num_refs, 32); +BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, + root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, + count, 32); -/* struct btrfs_extent_item */ -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, - refs, 32); +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + BUG(); + return 0; +} + +BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); +BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, + generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); +BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); @@ -1555,6 +1662,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) return (flags & flag) == flag; } +static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, + int rev) +{ + u64 flags = btrfs_header_flags(eb); + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) { unsigned long ptr = offsetof(struct btrfs_header, fsid); @@ -1787,39 +1909,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 bytenr); + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u32 blocksize, u64 parent, - u64 root_objectid, - u64 ref_generation, - int level, - u64 hint, - u64 empty_size); + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, int level); -int btrfs_alloc_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 min_bytes, - u64 root_objectid, u64 ref_generation, - u64 owner, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, u64 data); -int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins); -int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, @@ -1827,18 +1942,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, struct extent_buffer *buf, - u32 *nr_extents); -int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, u32 nr_extents); -int btrfs_update_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *orig_buf, - struct extent_buffer *buf, int start_slot, int nr); + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin); + u64 root_objectid, u64 owner, u64 offset); + int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1846,13 +1961,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid); -int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 num_bytes, - u64 orig_parent, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid); + u64 root_objectid, u64 owner, u64 offset); + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); @@ -1864,16 +1974,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); -int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); -int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, u64 orig_start); -int btrfs_add_dead_reloc_root(struct btrfs_root *root); -int btrfs_cleanup_reloc_trees(struct btrfs_root *root); -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group); + u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -1888,13 +1991,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); /* ctree.c */ +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -int btrfs_merge_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *node_keys, - u64 *nodes, int lowest_level); int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); @@ -1915,6 +2017,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf); int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u32 data_size); int btrfs_truncate_item(struct btrfs_trans_handle *trans, @@ -1941,9 +2045,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); -int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 bytenr); static inline int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) @@ -2002,8 +2103,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_search_root(struct btrfs_root *root, u64 search_start, u64 *found_objectid); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, - struct btrfs_root *latest_root); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); /* dir-item.c */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, @@ -2136,7 +2238,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); -void btrfs_read_locked_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, int wait); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); @@ -2144,12 +2245,8 @@ void btrfs_destroy_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); -struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, - struct btrfs_root *root, int wait); -struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, - struct btrfs_root *root); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *is_new); + struct btrfs_root *root); int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, @@ -2165,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); /* file.c */ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); @@ -2174,7 +2273,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_block); + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_block); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); @@ -2201,8 +2301,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ +#ifdef CONFIG_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); +#else +#define btrfs_check_acl NULL +#endif int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); +/* relocation.c */ +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d6c01c096a4..84e6781413b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -29,27 +29,87 @@ * add extents in the middle of btrfs_search_slot, and it allows * us to buffer up frequently modified backrefs in an rb tree instead * of hammering updates on the extent allocation tree. - * - * Right now this code is only used for reference counted trees, but - * the long term goal is to get rid of the similar code for delayed - * extent tree modifications. */ /* - * entries in the rb tree are ordered by the byte number of the extent - * and by the byte number of the parent block. + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, + struct btrfs_delayed_tree_ref *ref1) +{ + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type */ -static int comp_entry(struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 parent) +static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, + struct btrfs_delayed_data_ref *ref1) { - if (bytenr < ref->bytenr) + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * entries in the rb tree are ordered by the byte number of the extent, + * type of the delayed backrefs and content of delayed backrefs. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref2, + struct btrfs_delayed_ref_node *ref1) +{ + if (ref1->bytenr < ref2->bytenr) return -1; - if (bytenr > ref->bytenr) + if (ref1->bytenr > ref2->bytenr) return 1; - if (parent < ref->parent) + if (ref1->is_head && ref2->is_head) + return 0; + if (ref2->is_head) return -1; - if (parent > ref->parent) + if (ref1->is_head) return 1; + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { + return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), + btrfs_delayed_node_to_tree_ref(ref1)); + } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), + btrfs_delayed_node_to_data_ref(ref1)); + } + BUG(); return 0; } @@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref, * inserted. */ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - u64 bytenr, u64 parent, struct rb_node *node) { struct rb_node **p = &root->rb_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_node *ins; int cmp; + ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, rb_node); - cmp = comp_entry(entry, bytenr, parent); + cmp = comp_entry(entry, ins); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) @@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return entry; } - entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); rb_link_node(node, parent_node, p); rb_insert_color(node, root); return NULL; } /* - * find an entry based on (bytenr,parent). This returns the delayed - * ref if it was able to find one, or NULL if nothing was in that spot + * find an head entry based on bytenr. This returns the delayed ref + * head if it was able to find one, or NULL if nothing was in that spot */ -static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, - u64 bytenr, u64 parent, +static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, + u64 bytenr, struct btrfs_delayed_ref_node **last) { struct rb_node *n = root->rb_node; @@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, if (last) *last = entry; - cmp = comp_entry(entry, bytenr, parent); + if (bytenr < entry->bytenr) + cmp = -1; + else if (bytenr > entry->bytenr) + cmp = 1; + else if (!btrfs_delayed_ref_is_head(entry)) + cmp = 1; + else + cmp = 0; + if (cmp < 0) n = n->rb_left; else if (cmp > 0) @@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - tree_search(&delayed_refs->root, start, (u64)-1, &ref); + find_ref_head(&delayed_refs->root, start, &ref); if (ref) { struct btrfs_delayed_ref_node *tmp; @@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) { prev_node = rb_prev(&ref->rb_node); if (!prev_node) @@ -250,25 +318,28 @@ out: } /* - * helper function to lookup reference count + * helper function to lookup reference count and flags of extent. * * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. This way you - * can check to see what the reference count would be if all of the - * delayed refs are processed. + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. */ -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs) +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) { struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; - struct extent_buffer *leaf; struct btrfs_extent_item *ei; + struct extent_buffer *leaf; struct btrfs_key key; - u32 num_refs; + u32 item_size; + u64 num_refs; + u64 extent_flags; int ret; path = btrfs_alloc_path(); @@ -287,37 +358,60 @@ again: if (ret == 0) { leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); } else { num_refs = 0; + extent_flags = 0; ret = 0; } spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) { head = btrfs_delayed_node_to_head(ref); - if (mutex_trylock(&head->mutex)) { - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); - *refs = num_refs; - goto out; - } + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); + btrfs_release_path(root->fs_info->extent_root, path); - btrfs_release_path(root->fs_info->extent_root, path); + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); - mutex_lock(&head->mutex); + num_refs += ref->ref_mod; mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } else { - *refs = num_refs; } + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; out: spin_unlock(&delayed_refs->lock); btrfs_free_path(path); @@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { - struct btrfs_delayed_ref *existing_ref; - struct btrfs_delayed_ref *ref; - - existing_ref = btrfs_delayed_node_to_ref(existing); - ref = btrfs_delayed_node_to_ref(update); - - if (ref->pin) - existing_ref->pin = 1; - - if (ref->action != existing_ref->action) { + if (update->action != existing->action) { /* * this is effectively undoing either an add or a * drop. We decrement the ref_mod, and if it goes @@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, delayed_refs->num_entries--; if (trans->delayed_ref_updates) trans->delayed_ref_updates--; + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); } } else { - if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { - /* if we're adding refs, make sure all the - * details match up. The extent could - * have been totally freed and reallocated - * by a different owner before the delayed - * ref entries were removed. - */ - existing_ref->owner_objectid = ref->owner_objectid; - existing_ref->generation = ref->generation; - existing_ref->root = ref->root; - existing->num_bytes = update->num_bytes; - } + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); /* * the action on the existing ref matches * the action on the ref we're trying to add. @@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, existing_ref = btrfs_delayed_node_to_head(existing); ref = btrfs_delayed_node_to_head(update); + BUG_ON(existing_ref->is_data != ref->is_data); if (ref->must_insert_reserved) { /* if the extent was freed and then @@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } + if (ref->extent_op) { + if (!existing_ref->extent_op) { + existing_ref->extent_op = ref->extent_op; + } else { + if (ref->extent_op->update_key) { + memcpy(&existing_ref->extent_op->key, + &ref->extent_op->key, + sizeof(ref->extent_op->key)); + existing_ref->extent_op->update_key = 1; + } + if (ref->extent_op->update_flags) { + existing_ref->extent_op->flags_to_set |= + ref->extent_op->flags_to_set; + existing_ref->extent_op->update_flags = 1; + } + kfree(ref->extent_op); + } + } /* * update the reference mod on the head to reflect this new operation */ @@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } /* - * helper function to actually insert a delayed ref into the rbtree. + * helper function to actually insert a head node into the rbtree. * this does all the dirty work in terms of maintaining the correct - * overall modification count in the head node and properly dealing - * with updating existing nodes as new modifications are queued. + * overall modification count. */ -static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, + int action, int is_data) { struct btrfs_delayed_ref_node *existing; - struct btrfs_delayed_ref *full_ref; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; @@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ - if (parent == (u64)-1) { - if (action == BTRFS_DROP_DELAYED_REF) - count_mod = -1; - else if (action == BTRFS_UPDATE_DELAYED_HEAD) - count_mod = 0; - } + if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + else if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; /* * BTRFS_ADD_DELAYED_EXTENT means that we need to update @@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, * Once we record must_insert_reserved, switch the action to * BTRFS_ADD_DELAYED_REF because other special casing is not required. */ - if (action == BTRFS_ADD_DELAYED_EXTENT) { + if (action == BTRFS_ADD_DELAYED_EXTENT) must_insert_reserved = 1; - action = BTRFS_ADD_DELAYED_REF; - } else { + else must_insert_reserved = 0; - } - delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ atomic_set(&ref->refs, 1); ref->bytenr = bytenr; - ref->parent = parent; + ref->num_bytes = num_bytes; ref->ref_mod = count_mod; + ref->type = 0; + ref->action = 0; + ref->is_head = 1; ref->in_tree = 1; + + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + + INIT_LIST_HEAD(&head_ref->cluster); + mutex_init(&head_ref->mutex); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_head_ref(existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed tree ref into the rbtree. + */ +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_tree_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; - if (btrfs_delayed_ref_is_head(ref)) { - head_ref = btrfs_delayed_node_to_head(ref); - head_ref->must_insert_reserved = must_insert_reserved; - INIT_LIST_HEAD(&head_ref->cluster); - mutex_init(&head_ref->mutex); + full_ref = btrfs_delayed_node_to_tree_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_BLOCK_REF_KEY; } else { - full_ref = btrfs_delayed_node_to_ref(ref); full_ref->root = ref_root; - full_ref->generation = ref_generation; - full_ref->owner_objectid = owner_objectid; - full_ref->pin = pin; - full_ref->action = action; + ref->type = BTRFS_TREE_BLOCK_REF_KEY; } + full_ref->level = level; - existing = tree_insert(&delayed_refs->root, bytenr, - parent, &ref->rb_node); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { - if (btrfs_delayed_ref_is_head(ref)) - update_existing_head_ref(existing, ref); - else - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed data ref into the rbtree. + */ +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_data_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_data_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_DATA_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_EXTENT_DATA_REF_KEY; + } + full_ref->objectid = owner; + full_ref->offset = offset; + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); /* * we've updated the existing ref, free the newly * allocated ref */ kfree(ref); } else { - if (btrfs_delayed_ref_is_head(ref)) { - delayed_refs->num_heads++; - delayed_refs->num_heads_ready++; - } delayed_refs->num_entries++; trans->delayed_ref_updates++; } @@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, } /* - * add a delayed ref to the tree. This does all of the accounting required + * add a delayed tree ref. This does all of the accounting required * to make sure the delayed ref is eventually processed before this * transaction commits. */ -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_delayed_ref *ref; + struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; int ret; + BUG_ON(extent_op && extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); if (!ref) return -ENOMEM; + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + /* - * the parent = 0 case comes from cases where we don't actually - * know the parent yet. It will get updated later via a add/drop - * pair. + * insert both the head node and the new ref without dropping + * the spin lock */ - if (parent == 0) - parent = bytenr; + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); + BUG_ON(ret); + + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. + */ +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && !extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); if (!head_ref) { kfree(ref); return -ENOMEM; } + + head_ref->extent_op = extent_op; + delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, - (u64)-1, 0, 0, 0, action, pin); + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); BUG_ON(ret); - ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, ref_generation, - owner_objectid, action, pin); + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); BUG_ON(ret); + spin_unlock(&delayed_refs->lock); return 0; } @@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; @@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) * * It is the same as doing a ref add and delete in two separate calls. */ +#if 0 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_ref_root, u64 ref_root, @@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 0; } +#endif diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 3bec2ff0b15..f6fc67ddad3 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node { /* the starting bytenr of the extent */ u64 bytenr; - /* the parent our backref will point to */ - u64 parent; - /* the size of the extent */ u64 num_bytes; @@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node { */ int ref_mod; + unsigned int action:8; + unsigned int type:8; /* is this node still in the rbtree? */ + unsigned int is_head:1; unsigned int in_tree:1; }; +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u64 flags_to_set; + unsigned int update_key:1; + unsigned int update_flags:1; + unsigned int is_data:1; +}; + /* * the head refs are used to hold a lock on a given extent, which allows us * to make sure that only one process is running the delayed refs @@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head { struct list_head cluster; + struct btrfs_delayed_extent_op *extent_op; /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head { * the free has happened. */ unsigned int must_insert_reserved:1; + unsigned int is_data:1; }; -struct btrfs_delayed_ref { +struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + int level; +}; - /* the root objectid our ref will point to */ - u64 root; - - /* the generation for the backref */ - u64 generation; - - /* owner_objectid of the backref */ - u64 owner_objectid; - - /* operation done by this entry in the rbtree */ - u8 action; - - /* if pin == 1, when the extent is freed it will be pinned until - * transaction commit - */ - unsigned int pin:1; +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + u64 objectid; + u64 offset; }; struct btrfs_delayed_ref_root { @@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin); +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_ref_root, u64 ref_root, @@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, */ static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) { - return node->parent == (u64)-1; + return node->is_head; } /* * helper functions to cast a node into its container */ -static inline struct btrfs_delayed_ref * -btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) { WARN_ON(btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref, node); + return container_of(node, struct btrfs_delayed_tree_ref, node); +} +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_data_ref, node); } static inline struct btrfs_delayed_ref_head * @@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) { WARN_ON(!btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_ref_head, node); - } #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92caa8035f3..0d50d49d990 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -26,8 +26,8 @@ #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/crc32c.h> #include "compat.h" -#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -36,7 +36,6 @@ #include "print-tree.h" #include "async-thread.h" #include "locking.h" -#include "ref-cache.h" #include "tree-log.h" #include "free-space-cache.h" @@ -172,7 +171,7 @@ out: u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) { - return btrfs_crc32c(seed, data, len); + return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) @@ -232,10 +231,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk(KERN_INFO "btrfs: %s checksum verify failed " - "on %llu wanted %X found %X level %d\n", - root->fs_info->sb->s_id, - buf->start, val, found, btrfs_header_level(buf)); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } if (result != (char *)&inline_result) kfree(result); return 1; @@ -268,10 +271,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk("parent transid verify failed on %llu wanted %llu found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } ret = 1; clear_extent_buffer_uptodate(io_tree, eb); out: @@ -415,9 +421,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -429,8 +438,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -579,19 +590,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; atomic_inc(&fs_info->nr_async_submits); + + if (rw & (1 << BIO_RW_SYNCIO)) + btrfs_set_work_high_prio(&async->work); + btrfs_queue_worker(&fs_info->workers, &async->work); -#if 0 - int limit = btrfs_async_submit_limit(fs_info); - if (atomic_read(&fs_info->nr_async_submits) > limit) { - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) < limit), - HZ/10); - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_bios) < limit), - HZ/10); - } -#endif while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, @@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } + /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs @@ -765,27 +770,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -#if 0 -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct buffer_head *bh; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct buffer_head *head; - if (!page_has_buffers(page)) { - create_empty_buffers(page, root->fs_info->sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - head = page_buffers(page); - bh = head; - do { - if (buffer_dirty(bh)) - csum_tree_block(root, bh, 0); - bh = bh->b_this_page; - } while (bh != head); - return block_write_full_page(page, btree_get_block, wbc); -} -#endif - static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, @@ -863,8 +847,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, if (ret == 0) set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); - else - WARN_ON(1); return buf; } @@ -901,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, { root->node = NULL; root->commit_root = NULL; - root->ref_tree = NULL; root->sectorsize = sectorsize; root->nodesize = nodesize; root->leafsize = leafsize; @@ -916,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_inode_alloc = 0; root->name = NULL; root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); - INIT_LIST_HEAD(&root->dead_list); + INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->node_lock); spin_lock_init(&root->list_lock); + spin_lock_init(&root->inode_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -935,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); - btrfs_leaf_ref_tree_init(&root->ref_tree_struct); - root->ref_tree = &root->ref_tree_struct; - memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -976,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); return 0; } @@ -1042,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ root->ref_cows = 0; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); } + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); root->node = leaf; - btrfs_set_header_nritems(root->node, 0); - btrfs_set_header_level(root->node, 0); - btrfs_set_header_bytenr(root->node, root->node->start); - btrfs_set_header_generation(root->node, trans->transid); - btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); write_extent_buffer(root->node, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(root->node), @@ -1098,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, inode_item->nbytes = cpu_to_le64(root->leafsize); inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); - btrfs_set_root_generation(&log_root->root_item, trans->transid); + btrfs_set_root_node(&log_root->root_item, log_root->node); WARN_ON(root->log_root); root->log_root = log_root; @@ -1161,6 +1140,7 @@ out: blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); insert: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -1227,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, } if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid, root); + root->root_key.objectid); BUG_ON(ret); btrfs_orphan_cleanup(root); } @@ -1273,11 +1253,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; -#if 0 - if ((bdi_bits & (1 << BDI_write_congested)) && - btrfs_congested_async(info, 0)) - return 1; -#endif + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -1590,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); - atomic_set(&fs_info->throttles, 0); - atomic_set(&fs_info->throttle_gen, 0); fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; @@ -1599,6 +1573,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->metadata_ratio = 8; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1618,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode->i_mapping->a_ops = &btree_aops; fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1633,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; - INIT_LIST_HEAD(&fs_info->dead_reloc_roots); - btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); - btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); - BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); @@ -1689,17 +1661,23 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } + features = btrfs_super_incompat_flags(disk_super); + if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + btrfs_set_super_incompat_flags(disk_super, features); + } + features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -1791,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read the system " "array on %s\n", sb->s_id); - goto fail_sys_array; + goto fail_sb_buffer; } blocksize = btrfs_level_size(tree_root, @@ -1805,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root(disk_super), blocksize, generation); BUG_ON(!chunk_root->node); + btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); + chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), @@ -1830,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize, generation); if (!tree_root->node) goto fail_chunk_root; - + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); @@ -1840,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); - dev_root->track_dirty = 1; if (ret) goto fail_extent_root; + dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_extent_root; + goto fail_dev_root; csum_root->track_dirty = 1; @@ -1869,6 +1850,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; + if (!btrfs_test_opt(tree_root, SSD) && + !btrfs_test_opt(tree_root, NOSSD) && + !fs_info->fs_devices->rotating) { + printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + "mode\n"); + btrfs_set_opt(fs_info->mount_opt, SSD); + } + if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); @@ -1901,7 +1890,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, } if (!(sb->s_flags & MS_RDONLY)) { - ret = btrfs_cleanup_reloc_trees(tree_root); + ret = btrfs_recover_relocation(tree_root); BUG_ON(ret); } @@ -1912,6 +1901,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) goto fail_trans_kthread; + return tree_root; fail_trans_kthread: @@ -1928,14 +1918,19 @@ fail_cleaner: fail_csum_root: free_extent_buffer(csum_root->node); + free_extent_buffer(csum_root->commit_root); +fail_dev_root: + free_extent_buffer(dev_root->node); + free_extent_buffer(dev_root->commit_root); fail_extent_root: free_extent_buffer(extent_root->node); + free_extent_buffer(extent_root->commit_root); fail_tree_root: free_extent_buffer(tree_root->node); + free_extent_buffer(tree_root->commit_root); fail_chunk_root: free_extent_buffer(chunk_root->node); -fail_sys_array: - free_extent_buffer(dev_root->node); + free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); @@ -2025,6 +2020,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) return latest; } +/* + * this should be called twice, once with wait == 0 and + * once with wait == 1. When wait == 0 is done, all the buffer heads + * we write are pinned. + * + * They are released when wait == 1 is done. + * max_mirrors must be the same for both runs, and it indicates how + * many supers on this one device should be written. + * + * max_mirrors == 0 means to write them all. + */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int do_barriers, int wait, int max_mirrors) @@ -2060,12 +2066,16 @@ static int write_dev_supers(struct btrfs_device *device, bh = __find_get_block(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); BUG_ON(!bh); - brelse(bh); wait_on_buffer(bh); - if (buffer_uptodate(bh)) { - brelse(bh); - continue; - } + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the wait == 0 run */ + brelse(bh); + continue; } else { btrfs_set_super_bytenr(sb, bytenr); @@ -2076,12 +2086,18 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, sb->csum); + /* + * one reference for us, and we leave it for the + * caller + */ bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); - set_buffer_uptodate(bh); + /* one reference for submit_bh */ get_bh(bh); + + set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; } @@ -2093,30 +2109,24 @@ static int write_dev_supers(struct btrfs_device *device, device->name); set_buffer_uptodate(bh); device->barriers = 0; + /* one reference for submit_bh */ get_bh(bh); lock_buffer(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } } else { - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } - if (!ret && wait) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - errors++; - } else if (ret) { + if (ret) errors++; - } - if (wait) - brelse(bh); } return errors < i ? 0 : -1; } int write_all_supers(struct btrfs_root *root, int max_mirrors) { - struct list_head *head = &root->fs_info->fs_devices->devices; + struct list_head *head; struct btrfs_device *dev; struct btrfs_super_block *sb; struct btrfs_dev_item *dev_item; @@ -2131,6 +2141,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) sb = &root->fs_info->super_for_commit; dev_item = &sb->dev_item; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; @@ -2174,6 +2187,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) total_errors++; } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); @@ -2193,6 +2207,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); if (root->anon_super.s_dev) { @@ -2239,10 +2254,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) ARRAY_SIZE(gang)); if (!ret) break; + + root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { root_objectid = gang[i]->root_key.objectid; ret = btrfs_find_dead_roots(fs_info->tree_root, - root_objectid, gang[i]); + root_objectid); BUG_ON(ret); btrfs_orphan_cleanup(gang[i]); } @@ -2291,27 +2308,23 @@ int close_ctree(struct btrfs_root *root) if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - fs_info->delalloc_bytes); + (unsigned long long)fs_info->delalloc_bytes); } if (fs_info->total_ref_cache_size) { printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", (unsigned long long)fs_info->total_ref_cache_size); } - if (fs_info->extent_root->node) - free_extent_buffer(fs_info->extent_root->node); - - if (fs_info->tree_root->node) - free_extent_buffer(fs_info->tree_root->node); - - if (root->fs_info->chunk_root->node) - free_extent_buffer(root->fs_info->chunk_root->node); - - if (root->fs_info->dev_root->node) - free_extent_buffer(root->fs_info->dev_root->node); - - if (root->fs_info->csum_root->node) - free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(fs_info->extent_root->node); + free_extent_buffer(fs_info->extent_root->commit_root); + free_extent_buffer(fs_info->tree_root->node); + free_extent_buffer(fs_info->tree_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); @@ -2328,16 +2341,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); -#if 0 - while (!list_empty(&fs_info->hashers)) { - struct btrfs_hasher *hasher; - hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, - hashers); - list_del(&hasher->hashers); - crypto_free_hash(&fs_info->hash_tfm); - kfree(hasher); - } -#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2403,17 +2406,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - struct extent_io_tree *tree; u64 num_dirty; - u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; - tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; if (current->flags & PF_MEMALLOC) return; - num_dirty = count_range_bits(tree, &start, (u64)-1, - thresh, EXTENT_DIRTY); + num_dirty = root->fs_info->dirty_metadata_bytes; + if (num_dirty > thresh) { balance_dirty_pages_ratelimited_nr( root->fs_info->btree_inode->i_mapping, 1); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 85315d2c90d..9596b40caa4 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - inode = btrfs_iget(sb, &key, root, NULL); + inode = btrfs_iget(sb, &key, root); if (IS_ERR(inode)) return (void *)inode; @@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); } const struct export_operations btrfs_export_ops = { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 178df4c67de..edc7d208c5c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,50 +23,39 @@ #include <linux/rcupdate.h> #include "compat.h" #include "hash.h" -#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" #include "transaction.h" #include "volumes.h" #include "locking.h" -#include "ref-cache.h" #include "free-space-cache.h" -#define PENDING_EXTENT_INSERT 0 -#define PENDING_EXTENT_DELETE 1 -#define PENDING_BACKREF_UPDATE 2 - -struct pending_extent_op { - int type; - u64 bytenr; - u64 num_bytes; - u64 parent; - u64 orig_parent; - u64 generation; - u64 orig_generation; - int level; - struct list_head list; - int del; -}; - -static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod); static int update_reserved_extents(struct btrfs_root *root, u64 bytenr, u64 num, int reserve); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); -static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int ref_to_drop); +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -312,7 +301,7 @@ btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) } /* - * return the block group that contains teh given bytenr + * return the block group that contains the given bytenr */ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -453,199 +442,969 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) * maintenance. This is actually the same as #2, but with a slightly * different use case. * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COW'd through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * - * The extent ref structure has fields for: + * The extent ref structure for the implicit back refs has fields for: * * - Objectid of the subvolume root - * - Generation number of the tree holding the reference * - objectid of the file holding the reference - * - number of references holding by parent node (alway 1 for tree blocks) - * - * Btree leaf may hold multiple references to a file extent. In most cases, - * these references are from same file and the corresponding offsets inside - * the file are close together. - * - * When a file extent is allocated the fields are filled in: - * (root_key.objectid, trans->transid, inode objectid, 1) - * - * When a leaf is cow'd new references are added for every file extent found - * in the leaf. It looks similar to the create case, but trans->transid will - * be different when the block is cow'd. + * - original offset in the file + * - how many bookend extents * - * (root_key.objectid, trans->transid, inode objectid, - * number of references in the leaf) + * The key offset for the implicit back refs is hash of the first + * three fields. * - * When a file extent is removed either during snapshot deletion or - * file truncation, we find the corresponding back reference and check - * the following fields: + * The extent ref structure for the full back refs has field for: * - * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), - * inode objectid) + * - number of pointers in the tree leaf * - * Btree extents can be referenced by: - * - * - Different subvolumes - * - Different generations of the same subvolume + * The key offset for the implicit back refs is the first byte of + * the tree leaf * - * When a tree block is created, back references are inserted: + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: * - * (root->root_key.objectid, trans->transid, level, 1) + * (root_key.objectid, inode objectid, offset in file, 1) * - * When a tree block is cow'd, new back references are added for all the - * blocks it points to. If the tree block isn't in reference counted root, - * the old back references are removed. These new back references are of - * the form (trans->transid will have increased since creation): + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: * - * (root->root_key.objectid, trans->transid, level, 1) + * (btrfs_header_owner(leaf), inode objectid, offset in file) * - * When a backref is in deleting, the following fields are checked: + * Btree extents can be referenced by: * - * if backref was for a tree root: - * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) - * else - * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * - Different subvolumes * - * Back Reference Key composing: + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. * - * The key objectid corresponds to the first byte in the extent, the key - * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first - * byte of parent extent. If a extent is tree root, the key offset is set - * to the key objectid. + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. These information are stored in + * tree block info structure. */ -static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid, int del) +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int convert_extent_item_v0(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 owner, u32 extra_size) { + struct btrfs_extent_item *item; + struct btrfs_extent_item_v0 *ei0; + struct btrfs_extent_ref_v0 *ref0; + struct btrfs_tree_block_info *bi; + struct extent_buffer *leaf; struct btrfs_key key; - struct btrfs_extent_ref *ref; + struct btrfs_key found_key; + u32 new_size = sizeof(*item); + u64 refs; + int ret; + + leaf = path->nodes[0]; + BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + refs = btrfs_extent_refs_v0(leaf, ei0); + + if (owner == (u64)-1) { + while (1) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0]); + BUG_ON(key.objectid != found_key.objectid); + if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { + path->slots[0]++; + continue; + } + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + owner = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + } + btrfs_release_path(root, path); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) + new_size += sizeof(*bi); + + new_size -= sizeof(*ei0); + ret = btrfs_search_slot(trans, root, &key, path, + new_size + extra_size, 1); + if (ret < 0) + return ret; + BUG_ON(ret); + + ret = btrfs_extend_item(trans, root, path, new_size); + BUG_ON(ret); + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, refs); + /* FIXME: get real generation */ + btrfs_set_extent_generation(leaf, item, 0); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_set_extent_flags(leaf, item, + BTRFS_EXTENT_FLAG_TREE_BLOCK | + BTRFS_BLOCK_FLAG_FULL_BACKREF); + bi = (struct btrfs_tree_block_info *)(item + 1); + /* FIXME: get first key of the block */ + memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + btrfs_set_tree_block_level(leaf, bi, (int)owner); + } else { + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} +#endif + +static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; - u64 ref_objectid; + u32 nritems; int ret; + int recow; + int err = -ENOENT; key.objectid = bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = parent; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } - ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; + if (parent) { + if (!ret) + return 0; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + key.type = BTRFS_EXTENT_REF_V0_KEY; + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (!ret) + return 0; +#endif + goto fail; } leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - ref_objectid = btrfs_ref_objectid(leaf, ref); - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != ref_generation || - (ref_objectid != owner_objectid && - ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { - ret = -EIO; - WARN_ON(1); - goto out; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; } - ret = 0; -out: - return ret; +fail: + return err; } -static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid, - int refs_to_add) +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) { struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref; + u32 size; u32 num_refs; int ret; key.objectid = bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = parent; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); - if (ret == 0) { - leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - btrfs_set_ref_root(leaf, ref, ref_root); - btrfs_set_ref_generation(leaf, ref, ref_generation); - btrfs_set_ref_objectid(leaf, ref, owner_objectid); - btrfs_set_ref_num_refs(leaf, ref, refs_to_add); - } else if (ret == -EEXIST) { - u64 existing_owner; - - BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); - leaf = path->nodes[0]; + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != ref_generation) { - ret = -EIO; - WARN_ON(1); - goto out; + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(root, path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; - num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs == 0); - btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); - - existing_owner = btrfs_ref_objectid(leaf, ref); - if (existing_owner != owner_objectid && - existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { - btrfs_set_ref_objectid(leaf, ref, - BTRFS_MULTIPLE_OBJECTIDS); + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } - ret = 0; - } else { - goto out; } - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(path->nodes[0]); -out: + btrfs_mark_buffer_dirty(leaf); + ret = 0; +fail: btrfs_release_path(root, path); return ret; } -static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int refs_to_drop) +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) { + struct btrfs_key key; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref; - u32 num_refs; + u32 num_refs = 0; int ret = 0; leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - num_refs = btrfs_ref_num_refs(leaf, ref); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + BUG(); + } + BUG_ON(num_refs < refs_to_drop); num_refs -= refs_to_drop; + if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { - btrfs_set_ref_num_refs(leaf, ref, num_refs); + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + else { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + btrfs_set_ref_count_v0(leaf, ref0, num_refs); + } +#endif btrfs_mark_buffer_dirty(leaf); } + return ret; +} + +static noinline u32 extent_data_ref_count(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (iref) { + if (btrfs_extent_inline_ref_type(leaf, iref) == + BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + WARN_ON(1); + } + return num_refs; +} + +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ret == -ENOENT && parent) { + btrfs_release_path(root, path); + key.type = BTRFS_EXTENT_REF_V0_KEY; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + } +#endif + return ret; +} + +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(root, path); return ret; } +static inline int extent_ref_type(u64 parent, u64 owner) +{ + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; +} + +static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) + +{ + int level; + BUG_ON(!path->keep_locks); + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + btrfs_assert_tree_locked(path->nodes[level]); + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} + +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. + */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->keep_locks = 1; + } else + extra_size = -1; + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + if (!insert) { + err = -ENOENT; + goto out; + } + ret = convert_extent_item_v0(trans, root, path, owner, + extra_size); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + err = -ENOENT; + while (1) { + if (ptr >= end) { + WARN_ON(ptr > end); + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; + } + } + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + /* + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. + * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block + */ + if (find_next_key(path, &key) == 0 && key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + err = -EAGAIN; + goto out; + } + } + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert) { + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; +} + +/* + * helper to add new inline back ref + */ +static noinline_for_stack +int setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; + int ret; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; + + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); + + ret = btrfs_extend_item(trans, root, path, size); + BUG_ON(ret); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 0); + if (ret != -ENOENT) + return ret; + + btrfs_release_path(root, path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, + root_objectid, owner, offset); + } + return ret; +} + +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack +int update_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + int ret; + u64 refs; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_extent_inline_ref_type(leaf, iref); + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + BUG_ON(refs_to_mod != -1); + } + + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + ret = btrfs_truncate_item(trans, root, path, item_size, 1); + BUG_ON(ret); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + ret = update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); + } else if (ret == -ENOENT) { + ret = setup_inline_extent_backref(trans, root, path, iref, + parent, root_objectid, + owner, offset, refs_to_add, + extent_op); + } + return ret; +} + +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, root, path, bytenr, + parent, root_objectid); + } else { + ret = insert_extent_data_ref(trans, root, path, bytenr, + parent, root_objectid, + owner, offset, refs_to_add); + } + return ret; +} + +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data) +{ + int ret; + + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + ret = update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); + } else if (is_data) { + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + } else { + ret = btrfs_del_item(trans, root, path); + } + return ret; +} + #ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) @@ -686,71 +1445,40 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, #endif } -static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, - u64 orig_parent, u64 parent, - u64 orig_root, u64 ref_root, - u64 orig_generation, u64 ref_generation, - u64 owner_objectid) +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) { int ret; - int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && + root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, - orig_parent, parent, orig_root, - ref_root, orig_generation, - ref_generation, owner_objectid, pin); - BUG_ON(ret); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_REF, NULL); + } return ret; } -int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid) -{ - int ret; - if (ref_root == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - return 0; - - ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, parent, ref_root, - ref_root, ref_generation, - ref_generation, owner_objectid); - return ret; -} static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, - u64 orig_parent, u64 parent, - u64 orig_root, u64 ref_root, - u64 orig_generation, u64 ref_generation, - u64 owner_objectid) -{ - int ret; - - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_REF, 0); - BUG_ON(ret); - return ret; -} - -static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, - int refs_to_add) + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct extent_buffer *l; + struct extent_buffer *leaf; struct btrfs_extent_item *item; - u32 refs; + u64 refs; + int ret; + int err = 0; path = btrfs_alloc_path(); if (!path) @@ -758,43 +1486,27 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, path->reada = 1; path->leave_spinning = 1; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - /* first find the extent item and update its reference count */ - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, - path, 0, 1); - if (ret < 0) { - btrfs_set_path_blocking(path); - return ret; - } - - if (ret > 0) { - WARN_ON(1); - btrfs_free_path(path); - return -EIO; - } - l = path->nodes[0]; + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, num_bytes, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + if (ret == 0) + goto out; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); - if (key.objectid != bytenr) { - btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); - printk(KERN_ERR "btrfs wanted %llu found %llu\n", - (unsigned long long)bytenr, - (unsigned long long)key.objectid); - BUG(); + if (ret != -EAGAIN) { + err = ret; + goto out; } - BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); - - item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(l, item); - btrfs_set_extent_refs(l, item, refs + refs_to_add); - btrfs_unlock_up_safe(path, 1); - - btrfs_mark_buffer_dirty(path->nodes[0]); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); + btrfs_mark_buffer_dirty(leaf); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; @@ -802,56 +1514,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, parent, - ref_root, ref_generation, - owner_objectid, refs_to_add); + path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); BUG_ON(ret); +out: btrfs_free_path(path); - return 0; + return err; } -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid) +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) { - int ret; - if (ref_root == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - return 0; + int ret = 0; + struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, - 0, ref_root, 0, ref_generation, - owner_objectid); + ref = btrfs_delayed_node_to_data_ref(node); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) { + BUG_ON(extent_op->update_key); + flags |= extent_op->flags_to_set; + } + ret = alloc_reserved_file_extent(trans, root, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); + update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else { + BUG(); + } return ret; } -static int drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node) +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } + + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + path->reada = 1; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + err = -EIO; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + ret = convert_extent_item_v0(trans, root->fs_info->extent_root, + path, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) { int ret = 0; - struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; - BUG_ON(node->ref_mod == 0); - ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, ref->pin, node->ref_mod); + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ref = btrfs_delayed_node_to_tree_ref(node); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + BUG_ON(node->ref_mod != 1); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags || + !extent_op->update_key); + ret = alloc_reserved_tree_block(trans, root, + parent, ref_root, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); + update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else { + BUG(); + } return ret; } + /* helper function to actually process a single delayed ref entry */ -static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - int insert_reserved) +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) { int ret; - struct btrfs_delayed_ref *ref; - - if (node->parent == (u64)-1) { + if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; /* * we've hit the end of the chain and we were supposed @@ -859,44 +1712,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, * deleted before we ever needed to insert it, so all * we have to do is clean up the accounting */ + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); if (insert_reserved) { + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } + btrfs_update_pinned_extents(root, node->bytenr, + node->num_bytes, 1); update_reserved_extents(root, node->bytenr, node->num_bytes, 0); } - head = btrfs_delayed_node_to_head(node); mutex_unlock(&head->mutex); return 0; } - ref = btrfs_delayed_node_to_ref(node); - if (ref->action == BTRFS_ADD_DELAYED_REF) { - if (insert_reserved) { - struct btrfs_key ins; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - /* record the full extent allocation */ - ret = __btrfs_alloc_reserved_extent(trans, root, - node->parent, ref->root, - ref->generation, ref->owner_objectid, - &ins, node->ref_mod); - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); - } else { - /* just add one backref */ - ret = add_extent_ref(trans, root, node->bytenr, - node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, node->ref_mod); - } - BUG_ON(ret); - } else if (ref->action == BTRFS_DROP_DELAYED_REF) { - WARN_ON(insert_reserved); - ret = drop_delayed_ref(trans, root, node); - } - return 0; + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, root, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, root, node, extent_op, + insert_reserved); + else + BUG(); + return ret; } static noinline struct btrfs_delayed_ref_node * @@ -919,7 +1763,7 @@ again: rb_node); if (ref->bytenr != head->node.bytenr) break; - if (btrfs_delayed_node_to_ref(ref)->action == action) + if (ref->action == action) return ref; node = rb_prev(node); } @@ -937,6 +1781,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; + struct btrfs_delayed_extent_op *extent_op; int ret; int count = 0; int must_insert_reserved = 0; @@ -975,6 +1820,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, must_insert_reserved = locked_ref->must_insert_reserved; locked_ref->must_insert_reserved = 0; + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + /* * locked_ref is the head node, so we have to go one * node back for any delayed ref updates @@ -986,6 +1834,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * so that any accounting fixes can happen */ ref = &locked_ref->node; + + if (extent_op && must_insert_reserved) { + kfree(extent_op); + extent_op = NULL; + } + + if (extent_op) { + spin_unlock(&delayed_refs->lock); + + ret = run_delayed_extent_op(trans, root, + ref, extent_op); + BUG_ON(ret); + kfree(extent_op); + + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + list_del_init(&locked_ref->cluster); locked_ref = NULL; } @@ -993,14 +1860,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; + spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root, ref, + ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); BUG_ON(ret); - btrfs_put_delayed_ref(ref); + btrfs_put_delayed_ref(ref); + kfree(extent_op); count++; + cond_resched(); spin_lock(&delayed_refs->lock); } @@ -1095,25 +1965,112 @@ out: return 0; } -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 bytenr) +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = 1; + extent_op->update_key = 0; + extent_op->is_data = is_data ? 1 : 0; + + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + if (ret) + kfree(extent_op); + return ret; +} + +static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + int ret = 0; + + ret = -ENOENT; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out_unlock; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + if (ref->bytenr != bytenr) + goto out_unlock; + + ret = 1; + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) + goto out_unlock; + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + node = rb_prev(node); + if (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (ref->bytenr == bytenr) + goto out_unlock; + } + + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || data_ref->offset != offset) + goto out_unlock; + + ret = 0; +out_unlock: + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +static noinline int check_committed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) { struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref_item; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; struct btrfs_key key; - struct btrfs_key found_key; - u64 ref_root; - u64 last_snapshot; - u32 nritems; + u32 item_size; int ret; key.objectid = bytenr; key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; - path = btrfs_alloc_path(); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -1125,55 +2082,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, path->slots[0]--; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (found_key.objectid != bytenr || - found_key.type != BTRFS_EXTENT_ITEM_KEY) + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) goto out; - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret == 0) - continue; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != bytenr) - break; + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + goto out; + } +#endif + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - if (found_key.type != BTRFS_EXTENT_REF_KEY) { - path->slots[0]++; - continue; - } + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; - ref_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - ref_root = btrfs_ref_root(leaf, ref_item); - if ((ref_root != root->root_key.objectid && - ref_root != BTRFS_TREE_LOG_OBJECTID) || - objectid != btrfs_ref_objectid(leaf, ref_item)) { - ret = 1; - goto out; - } - if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { - ret = 1; + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + if (btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_path *path; + int ret; + int ret2; + + path = btrfs_alloc_path(); + if (!path) + return -ENOENT; + + do { + ret = check_committed_ref(trans, root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) goto out; - } - path->slots[0]++; + ret2 = check_delayed_ref(trans, root, path, objectid, + offset, bytenr); + } while (ret2 == -EAGAIN); + + if (ret2 && ret2 != -ENOENT) { + ret = ret2; + goto out; } - ret = 0; + + if (ret != -ENOENT || ret2 != -ENOENT) + ret = 0; out: btrfs_free_path(path); return ret; } +#if 0 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u32 nr_extents) { @@ -1291,62 +2276,44 @@ static int refsort_cmp(const void *a_void, const void *b_void) return 1; return 0; } +#endif - -noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, - struct extent_buffer *buf, u32 *nr_extents) + struct extent_buffer *buf, + int full_backref, int inc) { u64 bytenr; + u64 num_bytes; + u64 parent; u64 ref_root; - u64 orig_root; - u64 ref_generation; - u64 orig_generation; - struct refsort *sorted; u32 nritems; - u32 nr_file_extents = 0; struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int level; int ret = 0; - int faili = 0; - int refi = 0; - int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); - ref_generation = btrfs_header_generation(buf); - orig_root = btrfs_header_owner(orig_buf); - orig_generation = btrfs_header_generation(orig_buf); - nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); - BUG_ON(!sorted); + if (!root->ref_cows && level == 0) + return 0; - if (root->ref_cows) { - process_func = __btrfs_inc_extent_ref; - } else { - if (level == 0 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - goto out; - if (level != 0 && - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) - goto out; - process_func = __btrfs_update_extent_ref; - } + if (inc) + process_func = btrfs_inc_extent_ref; + else + process_func = btrfs_free_extent; + + if (full_backref) + parent = buf->start; + else + parent = 0; - /* - * we make two passes through the items. In the first pass we - * only record the byte number and slot. Then we sort based on - * byte number and do the actual work based on the sorted results - */ for (i = 0; i < nritems; i++) { - cond_resched(); if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) @@ -1360,151 +2327,38 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, if (bytenr == 0) continue; - nr_file_extents++; - sorted[refi].bytenr = bytenr; - sorted[refi].slot = i; - refi++; - } else { - bytenr = btrfs_node_blockptr(buf, i); - sorted[refi].bytenr = bytenr; - sorted[refi].slot = i; - refi++; - } - } - /* - * if refi == 0, we didn't actually put anything into the sorted - * array and we're done - */ - if (refi == 0) - goto out; - - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); - - for (i = 0; i < refi; i++) { - cond_resched(); - slot = sorted[i].slot; - bytenr = sorted[i].bytenr; - - if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, slot); - fi = btrfs_item_ptr(buf, slot, - struct btrfs_file_extent_item); - - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; - - ret = process_func(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); - - if (ret) { - faili = slot; - WARN_ON(1); + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, key.objectid, + key.offset); + if (ret) goto fail; - } } else { - ret = process_func(trans, root, bytenr, buf->len, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - level - 1); - if (ret) { - faili = slot; - WARN_ON(1); + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = btrfs_level_size(root, level - 1); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, level - 1, 0); + if (ret) goto fail; - } } } -out: - kfree(sorted); - if (nr_extents) { - if (level == 0) - *nr_extents = nr_file_extents; - else - *nr_extents = nritems; - } return 0; fail: - kfree(sorted); - WARN_ON(1); + BUG(); return ret; } -int btrfs_update_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *orig_buf, - struct extent_buffer *buf, int start_slot, int nr) - +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) { - u64 bytenr; - u64 ref_root; - u64 orig_root; - u64 ref_generation; - u64 orig_generation; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - int i; - int ret; - int slot; - int level; - - BUG_ON(start_slot < 0); - BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); - - ref_root = btrfs_header_owner(buf); - ref_generation = btrfs_header_generation(buf); - orig_root = btrfs_header_owner(orig_buf); - orig_generation = btrfs_header_generation(orig_buf); - level = btrfs_header_level(buf); - - if (!root->ref_cows) { - if (level == 0 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - return 0; - if (level != 0 && - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) - return 0; - } + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} - for (i = 0, slot = start_slot; i < nr; i++, slot++) { - cond_resched(); - if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(buf, slot, - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(buf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; - ret = __btrfs_update_extent_ref(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, orig_generation, - ref_generation, key.objectid); - if (ret) - goto fail; - } else { - bytenr = btrfs_node_blockptr(buf, slot); - ret = __btrfs_update_extent_ref(trans, root, bytenr, - buf->len, orig_buf->start, - buf->start, orig_root, ref_root, - orig_generation, ref_generation, - level - 1); - if (ret) - goto fail; - } - } - return 0; -fail: - WARN_ON(1); - return -1; +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -1844,10 +2698,14 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" - "%llu total\n", bytes, data_sinfo->bytes_delalloc, - data_sinfo->bytes_used, data_sinfo->bytes_reserved, - data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, - data_sinfo->bytes_may_use, data_sinfo->total_bytes); + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -1918,15 +2776,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, spin_unlock(&info->lock); } +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; u64 thresh; int ret = 0; - mutex_lock(&extent_root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); flags = btrfs_reduce_alloc_profile(extent_root, flags); @@ -1958,6 +2830,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } spin_unlock(&space_info->lock); + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret) space_info->full = 1; @@ -1977,6 +2861,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + old_val = btrfs_super_bytes_used(&info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(&info->super_copy, old_val); + + /* block accounting for root item */ + old_val = btrfs_root_used(&root->root_item); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_root_used(&root->root_item, old_val); + spin_unlock(&info->delalloc_lock); + while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) @@ -2186,8 +3088,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_owner = btrfs_header_owner(buf); u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_owner != BTRFS_TREE_RELOC_OBJECTID && - header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *must_clean = buf; @@ -2205,63 +3105,77 @@ pinit: return 0; } -/* - * remove an extent from the root, returns 0 on success - */ -static int __free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, int mark_free, - int refs_to_drop) + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; struct btrfs_key key; + struct btrfs_path *path; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; int ret; + int is_data; int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - struct btrfs_extent_item *ei; - u32 refs; + u32 item_size; + u64 refs; - key.objectid = bytenr; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - key.offset = num_bytes; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = 1; path->leave_spinning = 1; - ret = lookup_extent_backref(trans, extent_root, path, - bytenr, parent, root_objectid, - ref_generation, owner_objectid, 1); + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + ret = lookup_extent_backref(trans, extent_root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner_objectid, + owner_offset); if (ret == 0) { - struct btrfs_key found_key; extent_slot = path->slots[0]; - while (extent_slot > 0) { - extent_slot--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, extent_slot); - if (found_key.objectid != bytenr) + if (key.objectid != bytenr) break; - if (found_key.type == BTRFS_EXTENT_ITEM_KEY && - found_key.offset == num_bytes) { + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { found_extent = 1; break; } if (path->slots[0] - extent_slot > 5) break; + extent_slot--; } +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); + if (found_extent && item_size < sizeof(*ei)) + found_extent = 0; +#endif if (!found_extent) { + BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); + NULL, refs_to_drop, + is_data); BUG_ON(ret); btrfs_release_path(extent_root, path); path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -2277,82 +3191,98 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu gen %llu owner %llu\n", + "parent %llu root %llu owner %llu offset %llu\n", (unsigned long long)bytenr, (unsigned long long)parent, (unsigned long long)root_objectid, - (unsigned long long)ref_generation, - (unsigned long long)owner_objectid); + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); } leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + BUG_ON(found_extent || extent_slot != path->slots[0]); + ret = convert_extent_item_v0(trans, extent_root, path, + owner_objectid, 0); + BUG_ON(ret < 0); + + btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); + } +#endif + BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - - /* - * we're not allowed to delete the extent item if there - * are other delayed ref updates pending - */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } + refs = btrfs_extent_refs(leaf, ei); BUG_ON(refs < refs_to_drop); refs -= refs_to_drop; - btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); - if (refs == 0 && found_extent && - path->slots[0] == extent_slot + 1) { - struct btrfs_extent_ref *ref; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); - /* if the back ref and the extent are next to each other - * they get deleted below in one shot + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref */ - path->slots[0] = extent_slot; - num_to_del = 2; - } else if (found_extent) { - /* otherwise delete the extent back ref */ - ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); - BUG_ON(ret); - /* if refs are 0, we need to setup the path for deletion */ - if (refs == 0) { - btrfs_release_path(extent_root, path); - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, extent_root, &key, path, - -1, 1); + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, + is_data); BUG_ON(ret); } - } - - if (refs == 0) { - u64 super_used; - u64 root_used; + } else { + int mark_free = 0; struct extent_buffer *must_clean = NULL; - if (pin) { - ret = pin_down_bytes(trans, root, path, - bytenr, num_bytes, - owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, - &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(root, path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } } - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - super_used - num_bytes); - - /* block accounting for root item */ - root_used = btrfs_root_used(&root->root_item); - btrfs_set_root_used(&root->root_item, - root_used - num_bytes); - spin_unlock(&info->delalloc_lock); - + ret = pin_down_bytes(trans, root, path, bytenr, + num_bytes, is_data, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); /* * it is going to be very rare for someone to be waiting * on the block we're freeing. del_items might need to @@ -2373,7 +3303,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, free_extent_buffer(must_clean); } - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); } else { @@ -2391,34 +3321,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, } /* - * remove an extent from the root, returns 0 on success - */ -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int refs_to_drop) -{ - WARN_ON(num_bytes < root->sectorsize); - - /* - * if metadata always pin - * if data pin when any transaction has committed this - */ - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || - ref_generation != trans->transid) - pin = 1; - - if (ref_generation != trans->transid) - pin = 1; - - return __free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin, pin == 0, refs_to_drop); -} - -/* * when we free an extent, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it @@ -2449,6 +3351,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (ref->bytenr == bytenr) goto out; + if (head->extent_op) { + if (!head->must_insert_reserved) + goto out; + kfree(head->extent_op); + head->extent_op = NULL; + } + /* * waiting for the lock here would deadlock. If someone else has it * locked they are already in the process of dropping it anyway @@ -2477,7 +3386,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->must_insert_reserved); + &head->node, head->extent_op, + head->must_insert_reserved); BUG_ON(ret); btrfs_put_delayed_ref(&head->node); return 0; @@ -2489,32 +3399,32 @@ out: int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin) + u64 root_objectid, u64 owner, u64 offset) { int ret; /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. - * - * data extents referenced by the tree log do need to have - * their reference counts bumped. */ - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); ret = 0; - } else { - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, - BTRFS_DROP_DELAYED_REF, 1); + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); ret = check_ref_cleanup(trans, root, bytenr); BUG_ON(ret); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); } return ret; } @@ -2592,7 +3502,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start); if (block_group && block_group_bits(block_group, data)) { down_read(&space_info->groups_sem); - goto have_block_group; + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else + goto have_block_group; } else if (block_group) { btrfs_put_block_group(block_group); } @@ -2626,6 +3547,13 @@ have_block_group: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } + offset = btrfs_alloc_from_cluster(block_group, last_ptr, num_bytes, search_start); if (offset) { @@ -2651,10 +3579,17 @@ have_block_group: last_ptr_loop = 1; search_start = block_group->key.objectid; + /* + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list + */ goto have_block_group; } spin_unlock(&last_ptr->lock); - +refill_cluster: /* * this cluster didn't work out, free it and * start over @@ -2664,7 +3599,7 @@ have_block_group: last_ptr_loop = 0; /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, + ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, offset, num_bytes, empty_cluster + empty_size); @@ -2798,9 +3733,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", info->total_bytes, - info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, - info->bytes_used); + " may_use=%llu, used=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -2911,99 +3849,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, return ret; } -static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod) +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) { int ret; - u64 super_used; - u64 root_used; - u64 num_bytes = ins->offset; - u32 sizes[2]; - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_root *extent_root = info->extent_root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_extent_item *extent_item; - struct btrfs_extent_ref *ref; + struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; - struct btrfs_key keys[2]; - - if (parent == 0) - parent = ins->objectid; - - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); + struct extent_buffer *leaf; + int type; + u32 size; - /* block accounting for root item */ - root_used = btrfs_root_used(&root->root_item); - btrfs_set_root_used(&root->root_item, root_used + num_bytes); - spin_unlock(&info->delalloc_lock); + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; - memcpy(&keys[0], ins, sizeof(*ins)); - keys[1].objectid = ins->objectid; - keys[1].type = BTRFS_EXTENT_REF_KEY; - keys[1].offset = parent; - sizes[0] = sizeof(*extent_item); - sizes[1] = sizeof(*ref); + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); BUG_ON(!path); path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, extent_root, path, keys, - sizes, 2); + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); BUG_ON(ret); - extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_extent_ref); - - btrfs_set_ref_root(path->nodes[0], ref, root_objectid); - btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); - btrfs_set_ref_objectid(path->nodes[0], ref, owner); - btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } btrfs_mark_buffer_dirty(path->nodes[0]); - - trans->alloc_exclude_start = 0; - trans->alloc_exclude_nr = 0; btrfs_free_path(path); - if (ret) - goto out; - - ret = update_block_group(trans, root, ins->objectid, - ins->offset, 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); BUG(); } -out: return ret; } -int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) - return 0; + path = btrfs_alloc_path(); + BUG_ON(!path); - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner, - BTRFS_ADD_DELAYED_EXTENT, 0); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins) +{ + int ret; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); return ret; } @@ -3012,10 +3998,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, * an extent has been allocated and makes sure to clear the free * space cache bits as well */ -int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) { int ret; struct btrfs_block_group_cache *block_group; @@ -3029,8 +4015,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, ins->offset); BUG_ON(ret); btrfs_put_block_group(block_group); - ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins, 1); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); return ret; } @@ -3041,26 +4027,48 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, * * returns 0 if everything worked, non-zero otherwise. */ -int btrfs_alloc_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 min_alloc_size, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, u64 data) +static int alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 empty_size, u64 hint_byte, u64 search_end, + struct btrfs_key *ins) { int ret; - ret = __btrfs_reserve_extent(trans, root, num_bytes, - min_alloc_size, empty_size, hint_byte, - search_end, ins, data); + u64 flags = 0; + + ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, + empty_size, hint_byte, search_end, + ins, 0); BUG_ON(ret); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins->objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_EXTENT, 0); + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); BUG_ON(ret); } - update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -3099,21 +4107,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u32 blocksize, u64 parent, - u64 root_objectid, - u64 ref_generation, - int level, - u64 hint, - u64 empty_size) + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size) { struct btrfs_key ins; int ret; struct extent_buffer *buf; - ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, - root_objectid, ref_generation, level, - empty_size, hint, (u64)-1, &ins, 0); + ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, + key, level, empty_size, hint, (u64)-1, &ins); if (ret) { BUG_ON(ret > 0); return ERR_PTR(ret); @@ -3127,32 +4131,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { - u64 leaf_owner; - u64 leaf_generation; - struct refsort *sorted; + u64 disk_bytenr; + u64 num_bytes; struct btrfs_key key; struct btrfs_file_extent_item *fi; + u32 nritems; int i; - int nritems; int ret; - int refi = 0; - int slot; BUG_ON(!btrfs_is_leaf(leaf)); nritems = btrfs_header_nritems(leaf); - leaf_owner = btrfs_header_owner(leaf); - leaf_generation = btrfs_header_generation(leaf); - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); - /* we do this loop twice. The first time we build a list - * of the extents we have a reference on, then we sort the list - * by bytenr. The second time around we actually do the - * extent freeing. - */ for (i = 0; i < nritems; i++) { - u64 disk_bytenr; cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); /* only extents have references, skip everything else */ @@ -3172,45 +4163,16 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, if (disk_bytenr == 0) continue; - sorted[refi].bytenr = disk_bytenr; - sorted[refi].slot = i; - refi++; - } - - if (refi == 0) - goto out; - - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); - - for (i = 0; i < refi; i++) { - u64 disk_bytenr; - - disk_bytenr = sorted[i].bytenr; - slot = sorted[i].slot; - - cond_resched(); - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - ret = btrfs_free_extent(trans, root, disk_bytenr, - btrfs_file_extent_disk_num_bytes(leaf, fi), - leaf->start, leaf_owner, leaf_generation, - key.objectid, 0); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, + leaf->start, 0, key.objectid, 0); BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); } -out: - kfree(sorted); return 0; } +#if 0 + static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_leaf_ref *ref) @@ -3253,13 +4215,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } + static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); + ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -3294,6 +4257,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, return ret; } + /* * this is used while deleting old snapshots, and it drops the refs * on a whole subtree starting from a level 1 node. @@ -3587,32 +4551,36 @@ out: cond_resched(); return 0; } +#endif /* * helper function for drop_subtree, this function is similar to * walk_down_tree. The main difference is that it checks reference * counts while tree blocks are locked. */ -static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) { struct extent_buffer *next; struct extent_buffer *cur; struct extent_buffer *parent; u64 bytenr; u64 ptr_gen; + u64 refs; + u64 flags; u32 blocksize; - u32 refs; int ret; cur = path->nodes[*level]; - ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, - &refs); + ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, + &refs, &flags); BUG_ON(ret); if (refs > 1) goto out; + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + while (*level >= 0) { cur = path->nodes[*level]; if (*level == 0) { @@ -3634,16 +4602,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, - &refs); + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); BUG_ON(ret); if (refs > 1) { parent = path->nodes[*level]; ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - *level - 1, 1); + blocksize, parent->start, + btrfs_header_owner(parent), + *level - 1, 0); BUG_ON(ret); path->slots[*level]++; btrfs_tree_unlock(next); @@ -3651,6 +4618,8 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, continue; } + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + *level = btrfs_header_level(next); path->nodes[*level] = next; path->slots[*level] = 0; @@ -3658,13 +4627,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, cond_resched(); } out: - parent = path->nodes[*level + 1]; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; bytenr = path->nodes[*level]->start; blocksize = path->nodes[*level]->len; - ret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, btrfs_header_owner(parent), - btrfs_header_generation(parent), *level, 1); + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, + btrfs_header_owner(parent), *level, 0); BUG_ON(ret); if (path->locks[*level]) { @@ -3688,8 +4659,6 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, int max_level) { - u64 root_owner; - u64 root_gen; struct btrfs_root_item *root_item = &root->root_item; int i; int slot; @@ -3697,24 +4666,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, for (i = *level; i < max_level && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { - struct extent_buffer *node; - struct btrfs_disk_key disk_key; - + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { /* * there is more work to do in this level. * Update the drop_progress marker to reflect * the work we've done so far, and then bump * the slot number */ - node = path->nodes[i]; path->slots[i]++; - *level = i; WARN_ON(*level == 0); - btrfs_node_key(node, &disk_key, path->slots[i]); - memcpy(&root_item->drop_progress, - &disk_key, sizeof(disk_key)); - root_item->drop_level = i; + if (max_level == BTRFS_MAX_LEVEL) { + btrfs_node_key(path->nodes[i], + &root_item->drop_progress, + path->slots[i]); + root_item->drop_level = i; + } + *level = i; return 0; } else { struct extent_buffer *parent; @@ -3728,22 +4695,20 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, else parent = path->nodes[*level + 1]; - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - clean_tree_block(trans, root, path->nodes[*level]); + clean_tree_block(trans, root, path->nodes[i]); ret = btrfs_free_extent(trans, root, - path->nodes[*level]->start, - path->nodes[*level]->len, - parent->start, root_owner, - root_gen, *level, 1); + path->nodes[i]->start, + path->nodes[i]->len, + parent->start, + btrfs_header_owner(parent), + *level, 0); BUG_ON(ret); if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[*level]); - path->locks[*level] = 0; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; *level = i + 1; } } @@ -3762,21 +4727,18 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root int wret; int level; struct btrfs_path *path; - int i; - int orig_level; int update_count; struct btrfs_root_item *root_item = &root->root_item; - WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); path = btrfs_alloc_path(); BUG_ON(!path); level = btrfs_header_level(root->node); - orig_level = level; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - path->nodes[level] = root->node; - extent_buffer_get(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; + path->locks[level] = 1; } else { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -3798,12 +4760,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (path->nodes[i] && path->locks[i]) { - path->locks[i] = 0; - btrfs_tree_unlock(path->nodes[i]); - } - } + btrfs_unlock_up_safe(path, 0); } while (1) { unsigned long update; @@ -3824,8 +4781,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EAGAIN; break; } - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); for (update_count = 0; update_count < 16; update_count++) { update = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -3835,12 +4790,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; } } - for (i = 0; i <= orig_level; i++) { - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - } out: btrfs_free_path(path); return ret; @@ -3873,7 +4822,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_subtree(trans, root, path, &level); + wret = walk_down_tree(trans, root, path, &level); if (wret < 0) ret = wret; if (wret != 0) @@ -3890,6 +4839,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } +#if 0 static unsigned long calc_ra(unsigned long start, unsigned long last, unsigned long nr) { @@ -5371,6 +6321,7 @@ out: kfree(ref_path); return ret; } +#endif static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { @@ -5419,7 +6370,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, u64 calc; spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + if (btrfs_block_group_used(&shrink_block_group->item) + + shrink_block_group->reserved > 0) { spin_unlock(&shrink_block_group->lock); trans = btrfs_start_transaction(root, 1); @@ -5444,6 +6396,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, return 0; } + +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group) + +{ + __alloc_chunk_for_shrink(root, group, 1); + set_block_group_readonly(group); + return 0; +} + +#if 0 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 size) @@ -5723,6 +6686,7 @@ out: btrfs_free_path(path); return ret; } +#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) @@ -5935,6 +6899,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; struct btrfs_key key; int ret; @@ -5946,6 +6911,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + path = btrfs_alloc_path(); BUG_ON(!path); @@ -5955,7 +6935,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); spin_lock(&block_group->space_info->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb2bee8b7fb..68260180f58 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,12 +17,6 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -50,20 +44,23 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - int extent_locked; + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; }; int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; @@ -479,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + u64 last_end; int err; int set = 0; @@ -501,6 +499,7 @@ again: if (state->start > end) goto out; WARN_ON(state->end < start); + last_end = state->end; /* * | ---- desired range ---- | @@ -527,9 +526,11 @@ again: if (err) goto out; if (state->end <= end) { - start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } else { start = state->start; } @@ -555,8 +556,10 @@ again: goto out; } - start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; goto search_again; out: @@ -710,8 +713,10 @@ again: goto out; } set_state_bits(tree, state, bits); - start = state->end + 1; merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; goto search_again; } @@ -745,8 +750,10 @@ again: goto out; if (state->end <= end) { set_state_bits(tree, state, bits); - start = state->end + 1; merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } else { start = state->start; } @@ -1404,69 +1411,6 @@ out: return total_bytes; } -#if 0 -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} - -/* - * helper function to unlock both pages and extents in the tree. - */ -static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -#endif - /* * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. @@ -2101,6 +2045,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + /* * the writepage semantics are similar to regular writepage. extent * records are inserted to lock ranges in the tree, and as dirty areas @@ -2136,8 +2090,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 delalloc_end; int page_started; int compressed; + int write_flags; unsigned long nr_written = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -2164,6 +2124,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + while (delalloc_end < page_end) { nr_delalloc = find_lock_delalloc_range(inode, tree, page, @@ -2185,7 +2151,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (page_started) { ret = 0; - goto update_nr_written; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; } } lock_extent(tree, start, page_end, GFP_NOFS); @@ -2198,13 +2170,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; - goto update_nr_written; + goto done_unlocked; } } - nr_written++; + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); end = page_end; if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) @@ -2314,9 +2291,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, (unsigned long long)end); } - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, pg_offset, bdev, - &epd->bio, max_nr, + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0); if (ret) @@ -2336,11 +2313,8 @@ done: unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); -update_nr_written: - wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; +done_unlocked: + return 0; } @@ -2460,15 +2434,23 @@ retry: return ret; } -static noinline void flush_write_bio(void *data) +static void flush_epd_write_bio(struct extent_page_data *epd) { - struct extent_page_data *epd = data; if (epd->bio) { - submit_one_bio(WRITE, epd->bio, 0, 0); + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); epd->bio = NULL; } } +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + int extent_write_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct writeback_control *wbc) @@ -2480,23 +2462,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, + .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, }; - ret = __extent_writepage(page, wbc, &epd); extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2515,6 +2496,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .tree = tree, .get_extent = get_extent, .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = inode->i_mapping->backing_dev_info, @@ -2540,8 +2522,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_CACHE_SIZE; } - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2556,13 +2537,13 @@ int extent_writepages(struct extent_io_tree *tree, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(tree, mapping, wbc, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b187917b36f..30c9365861e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -6,19 +6,14 @@ #include <linux/hardirq.h> #include "extent_map.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), 0, - NULL); + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) tree->map.rb_node = NULL; spin_lock_init(&tree->lock); } -EXPORT_SYMBOL(extent_map_tree_init); /** * alloc_extent_map - allocate new extent map structure @@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask) atomic_set(&em->refs, 1); return em; } -EXPORT_SYMBOL(alloc_extent_map); /** * free_extent_map - drop reference count of an extent_map @@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em) kmem_cache_free(extent_map_cache, em); } } -EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) @@ -264,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree, out: return ret; } -EXPORT_SYMBOL(add_extent_mapping); /* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) @@ -326,7 +317,6 @@ found: out: return em; } -EXPORT_SYMBOL(lookup_extent_mapping); /** * remove_extent_mapping - removes an extent_map from the extent tree @@ -346,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) em->in_tree = 0; return ret; } -EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c9fb46ccd0..126477eaecf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_file(struct btrfs_root *root, struct inode *inode) -{ - return 0; -#if 0 - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - u64 last_offset = 0; - int nritems; - int slot; - int found_type; - int ret; - int err = 0; - u64 extent_end = 0; - - path = btrfs_alloc_path(); - ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, - last_offset, 0); - while (1) { - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - nritems = btrfs_header_nritems(path->nodes[0]); - } - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != inode->i_ino) - break; - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto out; - - if (found_key.offset < last_offset) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu " - "expected %llu\n", inode->i_ino, - (unsigned long long)found_key.offset, - (unsigned long long)last_offset); - err = 1; - goto out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, extent); - extent_end = (extent_end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - last_offset = extent_end; - path->slots[0]++; - } - if (0 && last_offset < inode->i_size) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu size %llu\n", - inode->i_ino, (unsigned long long)last_offset, - (unsigned long long)inode->i_size); - err = 1; - - } -out: - btrfs_free_path(path); - return err; -#endif -} - /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -363,20 +286,17 @@ out: */ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_byte) + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_byte) { u64 extent_end = 0; - u64 locked_end = end; u64 search_start = start; - u64 leaf_start; u64 ram_bytes = 0; - u64 orig_parent = 0; u64 disk_bytenr = 0; + u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; - u64 root_gen; - u64 root_owner; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; struct btrfs_path *path; @@ -416,9 +336,6 @@ next_slot: bookend = 0; found_extent = 0; found_inline = 0; - leaf_start = 0; - root_gen = 0; - root_owner = 0; compression = 0; encryption = 0; extent = NULL; @@ -493,9 +410,6 @@ next_slot: if (found_extent) { read_extent_buffer(leaf, &old, (unsigned long)extent, sizeof(old)); - root_gen = btrfs_header_generation(leaf); - root_owner = btrfs_header_owner(leaf); - leaf_start = leaf->start; } if (end < extent_end && end >= key.offset) { @@ -519,14 +433,14 @@ next_slot: } locked_end = extent_end; } - orig_parent = path->nodes[0]->start; disk_bytenr = le64_to_cpu(old.disk_bytenr); if (disk_bytenr != 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - orig_parent, root->root_key.objectid, - trans->transid, inode->i_ino); + le64_to_cpu(old.disk_num_bytes), 0, + root->root_key.objectid, + key.objectid, key.offset - + le64_to_cpu(old.offset)); BUG_ON(ret); } } @@ -644,17 +558,6 @@ next_slot: btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_set_lock_blocking(path->nodes[0]); - if (disk_bytenr != 0) { - ret = btrfs_update_extent_ref(trans, root, - disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - orig_parent, - leaf->start, - root->root_key.objectid, - trans->transid, ins.objectid); - - BUG_ON(ret); - } path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) @@ -670,8 +573,9 @@ next_slot: ret = btrfs_free_extent(trans, root, old_disk_bytenr, le64_to_cpu(old.disk_num_bytes), - leaf_start, root_owner, - root_gen, key.objectid, 0); + 0, root->root_key.objectid, + key.objectid, key.offset - + le64_to_cpu(old.offset)); BUG_ON(ret); *hint_byte = old_disk_bytenr; } @@ -684,11 +588,10 @@ next_slot: } out: btrfs_free_path(path); - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (locked_end > orig_locked_end) { + unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, + locked_end - 1, GFP_NOFS); } - btrfs_check_file(root, inode); return ret; } @@ -741,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, u64 bytenr; u64 num_bytes; u64 extent_end; - u64 extent_offset; + u64 orig_offset; u64 other_start; u64 other_end; u64 split = start; u64 locked_end = end; - u64 orig_parent; int extent_type; int split_end = 1; int ret; @@ -780,7 +682,7 @@ again: bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); if (key.offset == start) split = end; @@ -788,8 +690,6 @@ again: if (key.offset == start && extent_end == end) { int del_nr = 0; int del_slot = 0; - u64 leaf_owner = btrfs_header_owner(leaf); - u64 leaf_gen = btrfs_header_generation(leaf); other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, @@ -798,8 +698,8 @@ again: del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - leaf->start, leaf_owner, - leaf_gen, inode->i_ino, 0); + 0, root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); } other_start = 0; @@ -810,8 +710,8 @@ again: del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - leaf->start, leaf_owner, - leaf_gen, inode->i_ino, 0); + 0, root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); } split_end = 0; @@ -830,7 +730,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); BUG_ON(ret); - goto done; + goto release; } else if (split == start) { if (locked_end < extent_end) { ret = try_lock_extent(&BTRFS_I(inode)->io_tree, @@ -845,13 +745,12 @@ again: locked_end = extent_end; } btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); - extent_offset += split - key.offset; } else { BUG_ON(key.offset != start); - btrfs_set_file_extent_offset(leaf, fi, extent_offset + - split - key.offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); key.offset = split; + btrfs_set_file_extent_offset(leaf, fi, key.offset - + orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); btrfs_set_item_key_safe(trans, root, path, &key); extent_end = split; } @@ -870,7 +769,8 @@ again: struct btrfs_file_extent_item); key.offset = split; btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_offset(leaf, fi, key.offset - + orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - split); goto done; @@ -892,10 +792,9 @@ again: btrfs_mark_buffer_dirty(leaf); - orig_parent = leaf->start; - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, root->root_key.objectid, - trans->transid, inode->i_ino); + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); btrfs_release_path(root, path); @@ -910,22 +809,16 @@ again: btrfs_set_file_extent_type(leaf, fi, extent_type); btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_compression(leaf, fi, 0); btrfs_set_file_extent_encryption(leaf, fi, 0); btrfs_set_file_extent_other_encoding(leaf, fi, 0); - - if (orig_parent != leaf->start) { - ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, leaf->start, - root->root_key.objectid, - trans->transid, inode->i_ino); - BUG_ON(ret); - } done: btrfs_mark_buffer_dirty(leaf); + +release: btrfs_release_path(root, path); if (split_end && split == start) { split = end; @@ -1131,7 +1024,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, pos + write_bytes - 1, - WB_SYNC_NONE); + WB_SYNC_ALL); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); @@ -1264,6 +1157,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_wait_ordered_range(inode, 0, (u64)-1); root->log_batch++; + if (datasync && !(inode->i_state & I_DIRTY_PAGES)) + goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 768b9523662..4538e48581a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, printk(KERN_ERR "couldn't find space %llu to free\n", (unsigned long long)offset); printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, block_group->key.objectid, - block_group->key.offset); + block_group->cached, + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->key.offset); btrfs_dump_free_space(block_group, bytes); } else if (info) { printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " "but wanted offset=%llu bytes=%llu\n", - info->offset, info->bytes, offset, bytes); + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); } WARN_ON(1); } @@ -357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, - info->bytes); + printk(KERN_ERR "entry offset %llu, bytes %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); @@ -574,6 +579,7 @@ out: * it returns -enospc */ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) @@ -590,7 +596,9 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, int ret; /* for metadata, allow allocates with more holes */ - if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { /* * we want to do larger allocations when we are * flushing out the delayed refs, it helps prevent @@ -640,14 +648,15 @@ again: * we haven't filled the empty size and the window is * very large. reset and try again */ - if (next->offset - window_start > (bytes + empty_size) * 2) { + if (next->offset - (last->offset + last->bytes) > 128 * 1024 || + next->offset - window_start > (bytes + empty_size) * 2) { entry = next; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = 0; total_retries++; - if (total_retries % 256 == 0) { + if (total_retries % 64 == 0) { if (min_bytes >= (bytes + empty_size)) { ret = -ENOSPC; goto out; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ab0bdc0a63c..266fb876405 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -31,6 +31,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 2a020b27676..db2ff9773b9 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,9 +19,9 @@ #ifndef __HASH__ #define __HASH__ -#include "crc32c.h" +#include <linux/crc32c.h> static inline u64 btrfs_name_hash(const char *name, int len) { - return btrfs_crc32c((u32)~1, name, len); + return crc32c((u32)~1, name, len); } #endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc7334d833c..9abbced1123 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); BUG_ON(!path); - search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; search_key.type = 0; search_key.offset = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0d1dd492a5..8612b3a0981 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -48,7 +48,6 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "ref-cache.h" #include "compression.h" #include "locking.h" @@ -70,7 +69,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -234,7 +232,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -370,7 +368,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (!btrfs_test_flag(inode, NOCOMPRESS) && + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && btrfs_test_opt(root, COMPRESS)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -471,7 +469,7 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - btrfs_set_flag(inode, NOCOMPRESS); + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } if (will_compress) { *num_added += 1; @@ -864,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (btrfs_test_flag(inode, NOCOMPRESS)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -945,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 cow_start; u64 cur_offset; u64 extent_end; + u64 extent_offset; u64 disk_bytenr; u64 num_bytes; int extent_type; @@ -1006,6 +1005,7 @@ next_slot: if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (extent_end <= start) { @@ -1023,9 +1023,10 @@ next_slot: if (btrfs_extent_readonly(root, disk_bytenr)) goto out_check; if (btrfs_cross_ref_exist(trans, root, inode->i_ino, - disk_bytenr)) + found_key.offset - + extent_offset, disk_bytenr)) goto out_check; - disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* @@ -1132,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (btrfs_test_flag(inode, NODATACOW)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (btrfs_test_flag(inode, PREALLOC)) + else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS)) @@ -1289,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; - skip_sum = btrfs_test_flag(inode, NODATASUM); + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); @@ -1439,6 +1440,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1455,7 +1457,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1488,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, - root->root_key.objectid, - trans->transid, inode->i_ino, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, + root->root_key.objectid, + inode->i_ino, file_pos, &ins); BUG_ON(ret); btrfs_free_path(path); @@ -1590,6 +1593,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -1785,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, ClearPageChecked(page); goto good; } - if (btrfs_test_flag(inode, NODATASUM)) + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && @@ -1819,10 +1825,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); @@ -1951,23 +1959,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ - inode = btrfs_iget_locked(root->fs_info->sb, - found_key.offset, root); - if (!inode) + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &found_key, root); + if (IS_ERR(inode)) break; - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - - /* have to set the location manually */ - BTRFS_I(inode)->location.objectid = inode->i_ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it @@ -2011,9 +2009,60 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) } /* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* * read an inode from the btree into the in-memory inode */ -void btrfs_read_locked_inode(struct inode *inode) +static void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -2021,6 +2070,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2067,6 +2117,16 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) { + BTRFS_I(inode)->i_acl = NULL; + BTRFS_I(inode)->i_default_acl = NULL; + } + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2097,6 +2157,8 @@ void btrfs_read_locked_inode(struct inode *inode) init_special_inode(inode, inode->i_mode, rdev); break; } + + btrfs_update_iflags(inode); return; make_bad: @@ -2260,7 +2322,6 @@ err: btrfs_update_inode(trans, root, dir); btrfs_drop_nlink(inode); ret = btrfs_update_inode(trans, root, inode); - dir->i_sb->s_dirt = 1; out: return ret; } @@ -2532,9 +2593,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; u64 extent_start = 0; u64 extent_num_bytes = 0; + u64 extent_offset = 0; u64 item_end = 0; - u64 root_gen = 0; - u64 root_owner = 0; int found_extent; int del_item; int pending_del_nr = 0; @@ -2649,6 +2709,9 @@ search_again: extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + /* FIXME blocksize != 4096 */ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { @@ -2656,8 +2719,6 @@ search_again: if (root->ref_cows) inode_sub_bytes(inode, num_dec); } - root_gen = btrfs_header_generation(leaf); - root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* @@ -2701,12 +2762,12 @@ delete: } else { break; } - if (found_extent) { + if (found_extent && root->ref_cows) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, - leaf->start, root_owner, - root_gen, inode->i_ino, 0); + extent_num_bytes, 0, + btrfs_header_owner(leaf), + inode->i_ino, extent_offset); BUG_ON(ret); } next: @@ -2744,7 +2805,6 @@ error: pending_del_nr); } btrfs_free_path(path); - inode->i_sb->s_dirt = 1; return ret; } @@ -2877,6 +2937,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -3037,12 +3098,51 @@ static int fixup_tree_root_location(struct btrfs_root *root, return 0; } +static void inode_tree_add(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *entry; + struct rb_node **p = &root->inode_tree.rb_node; + struct rb_node *parent = NULL; + + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (inode->i_ino < entry->vfs_inode.i_ino) + p = &(*p)->rb_left; + else if (inode->i_ino > entry->vfs_inode.i_ino) + p = &(*p)->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING | I_CLEAR))); + break; + } + } + rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); + rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + spin_lock(&root->inode_lock); + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + } +} + static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; + bi->i_acl = BTRFS_ACL_NOT_CACHED; + bi->i_default_acl = BTRFS_ACL_NOT_CACHED; bi->generation = 0; bi->sequence = 0; @@ -3054,6 +3154,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->flags = 0; bi->index_cnt = (u64)-1; bi->last_unlink_trans = 0; + bi->ordered_data_close = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); @@ -3061,6 +3162,7 @@ static noinline void init_btrfs_i(struct inode *inode) inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3083,26 +3185,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, - struct btrfs_root *root, int wait) -{ - struct inode *inode; - struct btrfs_iget_args args; - args.ino = objectid; - args.root = root; - - if (wait) { - inode = ilookup5(s, objectid, btrfs_find_actor, - (void *)&args); - } else { - inode = ilookup5_nowait(s, objectid, btrfs_find_actor, - (void *)&args); - } - return inode; -} - -struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, - struct btrfs_root *root) +static struct inode *btrfs_iget_locked(struct super_block *s, + u64 objectid, + struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -3119,24 +3204,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, * Returns in *is_new if the inode was read from disk */ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *is_new) + struct btrfs_root *root) { struct inode *inode; inode = btrfs_iget_locked(s, location->objectid, root); if (!inode) - return ERR_PTR(-EACCES); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); + + inode_tree_add(inode); unlock_new_inode(inode); - if (is_new) - *is_new = 1; - } else { - if (is_new) - *is_new = 0; } return inode; @@ -3149,7 +3231,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *root = bi->root; struct btrfs_root *sub_root = root; struct btrfs_key location; - int ret, new; + int ret; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -3167,7 +3249,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); - inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + inode = btrfs_iget(dir->i_sb, &location, sub_root); if (IS_ERR(inode)) return ERR_CAST(inode); } @@ -3505,9 +3587,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_find_block_group(root, 0, alloc_hint, owner); if ((mode & S_IFREG)) { if (btrfs_test_opt(root, NODATASUM)) - btrfs_set_flag(inode, NODATASUM); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - btrfs_set_flag(inode, NODATACOW); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } key[0].objectid = objectid; @@ -3561,7 +3643,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location->offset = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + btrfs_inherit_iflags(inode, dir); + insert_inode_hash(inode); + inode_tree_add(inode); return inode; fail: if (dir) @@ -3681,7 +3766,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3746,7 +3830,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3793,7 +3876,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) drop_inode = 1; - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); @@ -3875,7 +3957,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); @@ -4227,7 +4308,6 @@ out: } if (err) { free_extent_map(em); - WARN_ON(1); return ERR_PTR(err); } return em; @@ -4615,6 +4695,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -4634,47 +4715,36 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -4915,7 +4985,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); if (drop_inode) @@ -4970,10 +5039,10 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) +static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, + u64 locked_end, u64 alloc_hint, int mode) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; @@ -4981,10 +5050,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, u64 num_bytes = end - start; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, alloc_size, @@ -4997,7 +5062,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -5007,7 +5073,7 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, out: if (cur_offset > start) { inode->i_ctime = CURRENT_TIME; - btrfs_set_flag(inode, PREALLOC); + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && cur_offset > i_size_read(inode)) btrfs_i_size_write(inode, cur_offset); @@ -5015,7 +5081,6 @@ out: BUG_ON(ret); } - btrfs_end_transaction(trans, root); return ret; } @@ -5027,13 +5092,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + struct btrfs_trans_handle *trans; int ret; alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + mutex_lock(&inode->i_mutex); if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); @@ -5041,10 +5114,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (!trans) { + ret = -EIO; + goto out; + } + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -5052,7 +5136,13 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); } else { @@ -5070,8 +5160,9 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(inode, cur_offset, - last_byte, alloc_hint, mode); + ret = prealloc_file_range(trans, inode, cur_offset, + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -5087,8 +5178,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); + + btrfs_end_transaction(trans, BTRFS_I(inode)->root); out: mutex_unlock(&inode->i_mutex); return ret; @@ -5101,7 +5194,7 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { - if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, btrfs_check_acl); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7594bec1be1..eff18f5b536 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,7 +50,177 @@ #include "volumes.h" #include "locking.h" +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + + if (ip->flags & BTRFS_INODE_SYNC) + inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; +} + +/* + * Inherit flags from the parent inode. + * + * Unlike extN we don't have any flags we don't want to inherit currently. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; + + if (S_ISREG(inode->i_mode)) + flags &= ~BTRFS_INODE_DIRSYNC; + else if (!S_ISDIR(inode->i_mode)) + flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + + BTRFS_I(inode)->flags = flags; + btrfs_update_iflags(inode); +} +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL)) + return -EOPNOTSUPP; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + mutex_lock(&inode->i_mutex); + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out_unlock; + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_end_transaction(trans, root); + + mnt_drop_write(file->f_path.mnt); + out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + return put_user(inode->i_generation, arg); +} static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, @@ -82,22 +252,25 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - objectid, trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; } - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); write_extent_buffer(leaf, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item.inode; @@ -125,7 +298,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_dirid(&root_item, new_dirid); key.objectid = objectid; - key.offset = 1; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); @@ -437,10 +610,6 @@ out_unlock: return 0; } -/* - * Called inside transaction, so use GFP_NOFS - */ - static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) { u64 new_size; @@ -461,15 +630,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -483,11 +646,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", devid); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", devid); + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_unlock; } @@ -545,7 +710,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) out_unlock: mutex_unlock(&root->fs_info->volume_mutex); -out: kfree(vol_args); return ret; } @@ -565,15 +729,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -675,19 +833,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -703,19 +855,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -830,7 +976,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + btrfs_drop_extents(trans, root, inode, off, off + len, + off + len, 0, &hint_byte); /* clone data */ key.objectid = src->i_ino; @@ -937,10 +1084,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, - disko, diskl, leaf->start, - root->root_key.objectid, - trans->transid, - inode->i_ino); + disko, diskl, 0, + root->root_key.objectid, + inode->i_ino, + new_key.offset - datao); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -1100,6 +1247,12 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 53c87b197d7..d6f0806c682 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -489,7 +489,7 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5f8f218c100..6d6523da0a3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb, (unsigned long long)btrfs_device_total_bytes(eb, dev_item), (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); } +static void print_extent_data_ref(struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + printk(KERN_INFO "\t\textent data backref root %llu " + "objectid %llu offset %llu count %u\n", + (unsigned long long)btrfs_extent_data_ref_root(eb, ref), + (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), + (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + int type; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 flags; + u64 offset; + + if (item_size < sizeof(*ei)) { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); + printk(KERN_INFO "\t\textent refs %u\n", + btrfs_extent_refs_v0(eb, ei0)); + return; +#else + BUG(); +#endif + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", + (unsigned long long)btrfs_extent_refs(eb, ei), + (unsigned long long)btrfs_extent_generation(eb, ei), + (unsigned long long)flags); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + "level %d\n", + (unsigned long long)btrfs_disk_key_objectid(&key), + key.type, + (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref " + "root %llu\n", (unsigned long long)offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref " + "parent %llu\n", (unsigned long long)offset); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + printk(KERN_INFO "\t\tshared data backref " + "parent %llu count %u\n", + (unsigned long long)offset, + btrfs_shared_data_ref_count(eb, sref)); + break; + default: + BUG(); + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static void print_extent_ref_v0(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_ref_v0 *ref0; + + ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); + printk("\t\textent back ref root %llu gen %llu " + "owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root_v0(eb, ref0), + (unsigned long long)btrfs_ref_generation_v0(eb, ref0), + (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + (unsigned long)btrfs_ref_count_v0(eb, ref0)); +} +#endif + void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; + u32 type; u32 nr = btrfs_header_nritems(l); struct btrfs_item *item; - struct btrfs_extent_item *ei; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_extent_ref *ref; - struct btrfs_dev_extent *dev_extent; - u32 type; printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, @@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); - printk(KERN_INFO "\t\textent data refs %u\n", - btrfs_extent_refs(l, ei)); - break; - case BTRFS_EXTENT_REF_KEY: - ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); - printk(KERN_INFO "\t\textent back ref root %llu " - "gen %llu owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root(l, ref), - (unsigned long long)btrfs_ref_generation(l, ref), - (unsigned long long)btrfs_ref_objectid(l, ref), - (unsigned long)btrfs_ref_num_refs(l, ref)); + print_extent_item(l, i); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + printk(KERN_INFO "\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); break; - case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); @@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; + case BTRFS_EXTENT_REF_V0_KEY: +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + print_extent_ref_v0(l, i); +#else + BUG(); +#endif case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 00000000000..b23dc209ae1 --- /dev/null +++ b/fs/btrfs/relocation.c @@ -0,0 +1,3711 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" + +/* + * backref_node, mapping_node and tree_block start with this + */ +struct tree_entry { + struct rb_node rb_node; + u64 bytenr; +}; + +/* + * present a tree block in the backref cache + */ +struct backref_node { + struct rb_node rb_node; + u64 bytenr; + /* objectid tree block owner */ + u64 owner; + /* list of upper level blocks reference this block */ + struct list_head upper; + /* list of child blocks in the cache */ + struct list_head lower; + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* extent buffer got by COW the block */ + struct extent_buffer *eb; + /* level of tree block */ + unsigned int level:8; + /* 1 if the block is root of old snapshot */ + unsigned int old_root:1; + /* 1 if no child blocks in the cache */ + unsigned int lowest:1; + /* is the extent buffer locked */ + unsigned int locked:1; + /* has the block been processed */ + unsigned int processed:1; + /* have backrefs of this block been checked */ + unsigned int checked:1; +}; + +/* + * present a block pointer in the backref cache + */ +struct backref_edge { + struct list_head list[2]; + struct backref_node *node[2]; + u64 blockptr; +}; + +#define LOWER 0 +#define UPPER 1 + +struct backref_cache { + /* red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* list of backref nodes with no child block in the cache */ + struct list_head pending[BTRFS_MAX_LEVEL]; + spinlock_t lock; +}; + +/* + * map address of tree root to tree + */ +struct mapping_node { + struct rb_node rb_node; + u64 bytenr; + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct rb_node rb_node; + u64 bytenr; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +/* inode vector */ +#define INODEVEC_SIZE 16 + +struct inodevec { + struct list_head list; + struct inode *inode[INODEVEC_SIZE]; + int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + struct btrfs_workers workers; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + u64 search_start; + u64 extents_found; + u64 extents_skipped; + int stage; + int create_reloc_root; + unsigned int found_file_extent:1; + unsigned int found_old_snapshot:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +/* + * merge reloc tree to corresponding fs tree in worker threads + */ +struct async_merge { + struct btrfs_work work; + struct reloc_control *rc; + struct btrfs_root *root; + struct completion *done; + atomic_t *num_pending; +}; + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root.rb_node = NULL; + spin_lock_init(&tree->lock); +} + +static void backref_cache_init(struct backref_cache *cache) +{ + int i; + cache->rb_root.rb_node = NULL; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + spin_lock_init(&cache->lock); +} + +static void backref_node_init(struct backref_node *node) +{ + memset(node, 0, sizeof(*node)); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct tree_entry *entry; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct backref_node *walk_up_backref(struct backref_node *node, + struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct backref_node *walk_down_backref(struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + struct backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void drop_node_buffer(struct backref_node *node) +{ + if (node->eb) { + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +static void drop_backref_node(struct backref_cache *tree, + struct backref_node *node) +{ + BUG_ON(!node->lowest); + BUG_ON(!list_empty(&node->upper)); + + drop_node_buffer(node); + list_del(&node->lower); + + rb_erase(&node->rb_node, &tree->rb_root); + kfree(node); +} + +/* + * remove a backref node from the backref cache + */ +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + struct backref_node *upper; + struct backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + kfree(edge); + /* + * add the node to pending list if no other + * child block cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, + &cache->pending[upper->level]); + upper->lowest = 1; + } + } + drop_backref_node(cache, node); +} + +/* + * find reloc tree by address of tree root + */ +static struct btrfs_root *find_reloc_root(struct reloc_control *rc, + u64 bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = (struct btrfs_root *)node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return root; +} + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_key key; + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(root_objectid)) + key.offset = 0; + else + key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &key); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static noinline_for_stack +struct btrfs_root *find_tree_root(struct reloc_control *rc, + struct extent_buffer *leaf, + struct btrfs_extent_ref_v0 *ref0) +{ + struct btrfs_root *root; + u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); + u64 generation = btrfs_ref_generation_v0(leaf, ref0); + + BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); + + root = read_fs_root(rc->extent_root->fs_info, root_objectid); + BUG_ON(IS_ERR(root)); + + if (root->ref_cows && + generation != btrfs_root_generation(&root->root_item)) + return NULL; + + return root; +} +#endif + +static noinline_for_stack +int find_inline_backref(struct extent_buffer *leaf, int slot, + unsigned long *ptr, unsigned long *end) +{ + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + u32 item_size; + + item_size = btrfs_item_size_nr(leaf, slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + return 1; + } +#endif + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + WARN_ON(!(btrfs_extent_flags(leaf, ei) & + BTRFS_EXTENT_FLAG_TREE_BLOCK)); + + if (item_size <= sizeof(*ei) + sizeof(*bi)) { + WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); + return 1; + } + + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + *end = (unsigned long)ei + item_size; + return 0; +} + +/* + * build backref tree for a given tree block. root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond + * roots of b-trees that reference the tree block. + * + * the basic idea of this function is check backrefs of a given block + * to find upper level blocks that refernece the block, and then check + * bakcrefs of these upper level blocks recursively. the recursion stop + * when tree root is reached or backrefs for the block is cached. + * + * NOTE: if we find backrefs for a block are cached, we know backrefs + * for all upper level blocks that directly/indirectly reference the + * block are also cached. + */ +static struct backref_node *build_backref_tree(struct reloc_control *rc, + struct backref_cache *cache, + struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct btrfs_path *path1; + struct btrfs_path *path2; + struct extent_buffer *eb; + struct btrfs_root *root; + struct backref_node *cur; + struct backref_node *upper; + struct backref_node *lower; + struct backref_node *node = NULL; + struct backref_node *exist = NULL; + struct backref_edge *edge; + struct rb_node *rb_node; + struct btrfs_key key; + unsigned long end; + unsigned long ptr; + LIST_HEAD(list); + int ret; + int err = 0; + + path1 = btrfs_alloc_path(); + path2 = btrfs_alloc_path(); + if (!path1 || !path2) { + err = -ENOMEM; + goto out; + } + + node = kmalloc(sizeof(*node), GFP_NOFS); + if (!node) { + err = -ENOMEM; + goto out; + } + + backref_node_init(node); + node->bytenr = bytenr; + node->owner = 0; + node->level = level; + node->lowest = 1; + cur = node; +again: + end = 0; + ptr = 0; + key.objectid = cur->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path1->search_commit_root = 1; + path1->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, + 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(!ret || !path1->slots[0]); + + path1->slots[0]--; + + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * the backref was added previously when processsing + * backref of type BTRFS_TREE_BLOCK_REF_KEY + */ + BUG_ON(!list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct backref_edge, + list[LOWER]); + BUG_ON(!list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * add the upper level block to pending list if we need + * check its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &list); + } else { + exist = NULL; + } + + while (1) { + cond_resched(); + eb = path1->nodes[0]; + + if (ptr >= end) { + if (path1->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + eb = path1->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); + if (key.objectid != cur->bytenr) { + WARN_ON(exist); + break; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = find_inline_backref(eb, path1->slots[0], + &ptr, &end); + if (ret) + goto next; + } + } + + if (ptr < end) { + /* update key for inline back ref */ + struct btrfs_extent_inline_ref *iref; + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && + key.type != BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + goto next; + } + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.objectid == key.offset && + key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(eb, path1->slots[0], + struct btrfs_extent_ref_v0); + root = find_tree_root(rc, eb, ref0); + if (root) + cur->root = root; + else + cur->old_root = 1; + break; + } +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { +#endif + if (key.objectid == key.offset) { + /* + * only root blocks of reloc trees use + * backref of this type. + */ + root = find_reloc_root(rc, cur->bytenr); + BUG_ON(!root); + cur->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + rb_node = tree_search(&cache->rb_root, key.offset); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = key.offset; + upper->owner = 0; + upper->level = cur->level + 1; + /* + * backrefs for the upper level block isn't + * cached, add the block to pending list + */ + list_add_tail(&edge->list[UPPER], &list); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add(&edge->list[LOWER], &cur->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = cur; + + goto next; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + goto next; + } + + /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + root = read_fs_root(rc->extent_root->fs_info, key.offset); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* tree root */ + BUG_ON(btrfs_root_bytenr(&root->root_item) != + cur->bytenr); + cur->root = root; + break; + } + + level = cur->level + 1; + + /* + * searching the tree to find upper level blocks + * reference the block. + */ + path2->search_commit_root = 1; + path2->skip_locking = 1; + path2->lowest_level = level; + ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); + path2->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + + eb = path2->nodes[level]; + WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr); + + lower = cur; + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path2->nodes[level]) { + BUG_ON(btrfs_root_bytenr(&root->root_item) != + lower->bytenr); + lower->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + + eb = path2->nodes[level]; + rb_node = tree_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = eb->start; + upper->owner = btrfs_header_owner(eb); + upper->level = lower->level + 1; + + /* + * if we know the block isn't shared + * we can void checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * add the block to pending list if we + * need check its backrefs. only block + * at 'cur->level + 1' is added to the + * tail of pending list. this guarantees + * we check backrefs from lower level + * blocks to upper level blocks. + */ + if (!upper->checked && + level == cur->level + 1) { + list_add_tail(&edge->list[UPPER], + &list); + } else + INIT_LIST_HEAD(&edge->list[UPPER]); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add_tail(&edge->list[LOWER], &lower->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = lower; + + if (rb_node) + break; + lower = upper; + upper = NULL; + } + btrfs_release_path(root, path2); +next: + if (ptr < end) { + ptr += btrfs_extent_inline_ref_size(key.type); + if (ptr >= end) { + WARN_ON(ptr > end); + ptr = 0; + end = 0; + } + } + if (ptr >= end) + path1->slots[0]++; + } + btrfs_release_path(rc->extent_root, path1); + + cur->checked = 1; + WARN_ON(exist); + + /* the pending list isn't empty, take the first block to process */ + if (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + goto again; + } + + /* + * everything goes well, connect backref nodes and insert backref nodes + * into the cache. + */ + BUG_ON(!node->checked); + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); + + list_for_each_entry(edge, &node->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + + while (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + BUG_ON(!upper->checked); + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + + list_add_tail(&edge->list[UPPER], &upper->lower); + + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + } +out: + btrfs_free_path(path1); + btrfs_free_path(path2); + if (err) { + INIT_LIST_HEAD(&list); + upper = node; + while (upper) { + if (RB_EMPTY_NODE(&upper->rb_node)) { + list_splice_tail(&upper->upper, &list); + kfree(upper); + } + + if (list_empty(&list)) + break; + + edge = list_entry(list.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + kfree(edge); + } + return ERR_PTR(err); + } + return node; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __add_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + BUG_ON(!node); + + node->bytenr = root->node->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to update/delete the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, int del) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + BUG_ON((struct btrfs_root *)node->data != root); + + if (!del) { + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = root->node->start; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + } else { + list_del_init(&root->root_list); + kfree(node); + } + return 0; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!root->fs_info->reloc_ctl || + !root->fs_info->reloc_ctl->create_reloc_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = root->root_key.objectid; + + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(IS_ERR(reloc_root)); + reloc_root->last_trans = trans->transid; + + __add_reloc_root(reloc_root); + root->reloc_root = reloc_root; + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int del = 0; + int ret; + + if (!root->reloc_root) + return 0; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_root_refs(root_item) == 0) { + root->reloc_root = NULL; + del = 1; + } + + __update_reloc_root(reloc_root, del); + + if (reloc_root->commit_root != reloc_root->node) { + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, root_item); + BUG_ON(ret); + return 0; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < entry->vfs_inode.i_ino) + node = node->rb_left; + else if (objectid > entry->vfs_inode.i_ino) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= entry->vfs_inode.i_ino) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + return inode; + } + + objectid = entry->vfs_inode.i_ino + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +static int in_block_group(u64 bytenr, + struct btrfs_block_group_cache *block_group) +{ + if (bytenr >= block_group->key.objectid && + bytenr < block_group->key.objectid + block_group->key.offset) + return 1; + return 0; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = 1; + goto out; + } + + if (new_bytenr) + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct list_head *inode_list) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct inodevec *ivec = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_block_group(bytenr, rc->block_group)) + continue; + + /* + * if we are modifying block in fs tree, wait for readpage + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!ivec || ivec->nr == INODEVEC_SIZE) { + ivec = kmalloc(sizeof(*ivec), GFP_NOFS); + BUG_ON(!ivec); + ivec->nr = 0; + list_add_tail(&ivec->list, inode_list); + } + if (first) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + first = 0; + } else if (inode && inode->i_ino < key.objectid) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + } + if (inode && inode->i_ino == key.objectid) { + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, + GFP_NOFS); + if (!ret) + continue; + + btrfs_drop_extent_cache(inode, key.offset, end, + 1); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, GFP_NOFS); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + num_bytes, parent, + btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + parent, btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + } + if (dirty) + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + struct extent_buffer **leaf, + int lowest_level, int max_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int level; + int ret; + int slot; + + BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(lowest_level > 1 && leaf); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); + + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + btrfs_set_lock_blocking(eb); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + BUG_ON(level < lowest_level); + + ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = btrfs_level_size(dest, level - 1); + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (new_bytenr > 0 && new_bytenr == old_bytenr) { + WARN_ON(1); + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level && !leaf) { + ret = 0; + break; + } + + eb = read_tree_block(dest, old_bytenr, blocksize, + old_ptr_gen); + btrfs_tree_lock(eb); + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (level <= lowest_level) { + *leaf = eb; + ret = 0; + break; + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(src, path); + + path->lowest_level = level; + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + path->lowest_level = 0; + BUG_ON(ret); + + /* + * swap blocks in fs tree and reloc tree. + */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(path->nodes[level]); + + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 bytenr; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 blocksize; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + bytenr = btrfs_node_blockptr(eb, path->slots[i]); + blocksize = btrfs_level_size(root, i - 1); + eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + + objectid = min_key->objectid; + while (1) { + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + + if (inode->i_ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = inode->i_ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == inode->i_ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == inode->i_ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for readpage to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + btrfs_drop_extent_cache(inode, start, end, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + } + return 0; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. + */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + LIST_HEAD(inode_list); + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf = NULL; + unsigned long nr; + int level; + int max_level; + int replaced = 0; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + extent_buffer_get(reloc_root->node); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { + trans = btrfs_start_transaction(root, 1); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(reloc_root, path); + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + btrfs_unlock_up_safe(path, 1); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + if (ret < 0) + err = ret; + goto out; + } + + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + leaf = NULL; + replaced = 0; + trans = btrfs_start_transaction(root, 1); + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_path(trans, root, reloc_root, + path, &next_key, &leaf, + level, max_level); + } else { + ret = replace_path(trans, root, reloc_root, + path, &next_key, NULL, + level, max_level); + } + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } else if (leaf) { + /* + * no block got replaced, try replacing file extents + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + BUG_ON(ret < 0); + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. + */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + if (ret < 0) + err = ret; +out: + btrfs_free_path(path); + + if (err == 0) { + memset(&root_item->drop_progress, 0, + sizeof(root_item->drop_progress)); + root_item->drop_level = 0; + btrfs_set_root_refs(root_item, 0); + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + /* + * put inodes while we aren't holding the tree locks + */ + while (!list_empty(&inode_list)) { + struct inodevec *ivec; + ivec = list_entry(inode_list.next, struct inodevec, list); + list_del(&ivec->list); + while (ivec->nr > 0) { + ivec->nr--; + iput(ivec->inode[ivec->nr]); + } + kfree(ivec); + } + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return err; +} + +/* + * callback for the work threads. + * this function merges reloc tree with corresponding fs tree, + * and then drops the reloc tree. + */ +static void merge_func(struct btrfs_work *work) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_root *reloc_root; + struct async_merge *async; + + async = container_of(work, struct async_merge, work); + reloc_root = async->root; + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + merge_reloc_root(async->rc, root); + + trans = btrfs_start_transaction(root, 1); + btrfs_update_reloc_root(trans, root); + btrfs_end_transaction(trans, root); + } + + btrfs_drop_dead_root(reloc_root); + + if (atomic_dec_and_test(async->num_pending)) + complete(async->done); + + kfree(async); +} + +static int merge_reloc_roots(struct reloc_control *rc) +{ + struct async_merge *async; + struct btrfs_root *root; + struct completion done; + atomic_t num_pending; + + init_completion(&done); + atomic_set(&num_pending, 1); + + while (!list_empty(&rc->reloc_roots)) { + root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + + async = kmalloc(sizeof(*async), GFP_NOFS); + BUG_ON(!async); + async->work.func = merge_func; + async->work.flags = 0; + async->rc = rc; + async->root = root; + async->done = &done; + async->num_pending = &num_pending; + atomic_inc(&num_pending); + btrfs_queue_worker(&rc->workers, &async->work); + } + + if (!atomic_dec_and_test(&num_pending)) + wait_for_completion(&done); + + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + return 0; +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_root *root; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + return btrfs_record_root_in_trans(trans, root); +} + +/* + * select one tree from trees that references the block. + * for blocks in refernce counted trees, we preper reloc tree. + * if no reloc tree found and reloc_only is true, NULL is returned. + */ +static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], + int *nr, int reloc_only) +{ + struct backref_node *next; + struct btrfs_root *root; + int index; + int loop = 0; +again: + index = 0; + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + if (!root) { + BUG_ON(!node->old_root); + goto skip; + } + + /* no other choice for non-refernce counted tree */ + if (!root->ref_cows) { + BUG_ON(reloc_only); + break; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + record_reloc_root_in_trans(trans, root); + break; + } + + if (loop) { + btrfs_record_root_in_trans(trans, root); + break; + } + + if (reloc_only || next != node) { + if (!root->reloc_root) + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + /* + * if the reloc tree was created in current + * transation, there is no node in backref tree + * corresponds to the root of the reloc tree. + */ + if (btrfs_root_last_snapshot(&root->root_item) == + trans->transid - 1) + break; + } +skip: + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!root && !loop && !reloc_only) { + loop = 1; + goto again; + } + + if (root) + *nr = index; + else + *nr = 0; + + return root; +} + +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node) +{ + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int nr; + return __select_one_root(trans, node, edges, &nr, 0); +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], int *nr) +{ + return __select_one_root(trans, node, edges, nr, 1); +} + +static void grab_path_buffers(struct btrfs_path *path, + struct backref_node *node, + struct backref_edge *edges[], int nr) +{ + int i = 0; + while (1) { + drop_node_buffer(node); + node->eb = path->nodes[node->level]; + BUG_ON(!node->eb); + if (path->locks[node->level]) + node->locked = 1; + path->nodes[node->level] = NULL; + path->locks[node->level] = 0; + + if (i >= nr) + break; + + edges[i]->blockptr = node->eb->start; + node = edges[i]->node[UPPER]; + i++; + } +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. + */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct backref_node *upper; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + u64 generation; + int nr; + int slot; + int ret; + int err = 0; + + BUG_ON(lowest && node->eb); + + path->lowest_level = node->level + 1; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + cond_resched(); + if (node->eb && node->eb->start == edge->blockptr) + continue; + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, upper, edges, &nr); + if (!root) + continue; + + if (upper->eb && !upper->locked) + drop_node_buffer(upper); + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + + slot = path->slots[upper->level]; + + btrfs_unlock_up_safe(path, upper->level + 1); + grab_path_buffers(path, upper, edges, nr); + + btrfs_release_path(NULL, path); + } else { + ret = btrfs_bin_search(upper->eb, key, upper->level, + &slot); + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (!lowest) { + if (node->eb->start == bytenr) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + continue; + } + } else { + BUG_ON(node->bytenr != bytenr); + } + + blocksize = btrfs_level_size(root, node->level); + generation = btrfs_node_ptr_generation(upper->eb, slot); + eb = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb); + if (ret < 0) { + err = ret; + break; + } + btrfs_set_lock_blocking(eb); + node->eb = eb; + node->locked = 1; + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(upper->eb); + + ret = btrfs_inc_extent_ref(trans, root, + node->eb->start, blocksize, + upper->eb->start, + btrfs_header_owner(upper->eb), + node->level, 0); + BUG_ON(ret); + + ret = btrfs_drop_subtree(trans, root, eb, upper->eb); + BUG_ON(ret); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + if (!lowest) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + } + } + path->lowest_level = 0; + return err; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + if (!node->eb || list_empty(&node->upper)) + return 0; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct backref_cache *cache, + struct btrfs_path *path) +{ + struct backref_node *node; + int level; + int ret; + int err = 0; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while (!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct backref_node, lower); + BUG_ON(node->level != level); + + ret = link_to_upper(trans, node, path); + if (ret < 0) + err = ret; + /* + * this remove the node from the pending list and + * may add some other nodes to the level + 1 + * pending list + */ + remove_backref_node(cache, node); + } + } + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + return err; +} + +static void mark_block_processed(struct reloc_control *rc, + struct backref_node *node) +{ + u32 blocksize; + if (node->level == 0 || + in_block_group(node->bytenr, rc->block_group)) { + blocksize = btrfs_level_size(rc->extent_root, node->level); + set_extent_bits(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + GFP_NOFS); + } + node->processed = 1; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct backref_node *node) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) +{ + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1)) + return 1; + return 0; +} + +/* + * check if there are any file extent pointers in the leaf point to + * data require processing + */ +static int check_file_extents(struct reloc_control *rc, + u64 bytenr, u32 blocksize, u64 ptr_gen) +{ + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u32 nritems; + int i; + int ret = 0; + + leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &found_key, i); + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) + continue; + if (in_block_group(bytenr, rc->block_group)) { + ret = 1; + break; + } + } + free_extent_buffer(leaf); + return ret; +} + +/* + * scan child blocks of a given block to find blocks require processing + */ +static int add_child_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 nritems; + int i; + int err = 0; + + nritems = btrfs_header_nritems(node->eb); + blocksize = btrfs_level_size(rc->extent_root, node->level - 1); + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + + readahead_tree_block(rc->extent_root, + bytenr, blocksize, ptr_gen); + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + if (!in_block_group(bytenr, rc->block_group) && + !check_file_extents(rc, bytenr, blocksize, ptr_gen)) + continue; + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = bytenr; + btrfs_node_key_to_cpu(node->eb, &block->key, i); + block->level = node->level - 1; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + } + if (err) + free_block_list(blocks); + return err; +} + +/* + * find adjacent blocks require processing + */ +static noinline_for_stack +int add_adjacent_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_cache *cache, + struct rb_root *blocks, int level, + struct backref_node **upper) +{ + struct backref_node *node; + int ret = 0; + + WARN_ON(!list_empty(&cache->pending[level])); + + if (list_empty(&cache->pending[level + 1])) + return 1; + + node = list_entry(cache->pending[level + 1].next, + struct backref_node, lower); + if (node->eb) + ret = add_child_blocks(trans, rc, node, blocks); + + *upper = node; + return ret; +} + +static int get_tree_block_key(struct reloc_control *rc, + struct tree_block *block) +{ + struct extent_buffer *eb; + + BUG_ON(block->key_ready); + eb = read_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + WARN_ON(btrfs_header_level(eb) != block->level); + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +static int reada_tree_block(struct reloc_control *rc, + struct tree_block *block) +{ + BUG_ON(block->key_ready); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int ret; + + root = select_one_root(trans, node); + if (unlikely(!root)) { + rc->found_old_snapshot = 1; + update_processed_blocks(rc, node); + return 0; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = do_relocation(trans, node, key, path, 1); + if (ret < 0) + goto out; + if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, + node->eb, NULL); + if (ret < 0) + goto out; + } + drop_node_buffer(node); + } else if (!root->ref_cows) { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(root, path); + if (ret < 0) + goto out; + } else if (root != node->root) { + WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); + } + + update_processed_blocks(rc, node); + ret = 0; +out: + drop_node_buffer(node); + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct backref_cache *cache; + struct backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct rb_node *rb_node; + int level = -1; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + cache = kmalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + btrfs_free_path(path); + return -ENOMEM; + } + + backref_cache_init(cache); + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (level == -1) + level = block->level; + else + BUG_ON(level != block->level); + if (!block->key_ready) + reada_tree_block(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + get_tree_block_key(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + + node = build_backref_tree(rc, cache, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + + if (level > 0) + goto out; + + free_block_list(blocks); + + /* + * now backrefs of some upper level tree blocks have been cached, + * try relocating blocks referenced by these upper level blocks. + */ + while (1) { + struct backref_node *upper = NULL; + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + break; + + ret = add_adjacent_blocks(trans, rc, cache, blocks, level, + &upper); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + goto out; + BUG_ON(!block->key_ready); + node = build_backref_tree(rc, cache, &block->key, + level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, + &block->key, path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + free_block_list(blocks); + + if (upper) { + ret = link_to_upper(trans, upper, path); + if (ret < 0) { + err = ret; + break; + } + remove_backref_node(cache, upper); + } + } +out: + free_block_list(blocks); + + ret = finish_pending_nodes(trans, cache, path); + if (ret < 0) + err = ret; + + kfree(cache); + btrfs_free_path(path); + return err; +} + +static noinline_for_stack +int relocate_inode_pages(struct inode *inode, u64 start, u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long i; + unsigned long first_index; + unsigned long last_index; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + struct page *page; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + min(last_index, ra->ra_pages + i - 1)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } +out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(inode, start, extent_key->offset); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int get_ref_objectid_v0(struct reloc_control *rc, + struct btrfs_path *path, + struct btrfs_key *extent_key, + u64 *ref_objectid, int *path_change) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref_v0 *ref0; + int ret; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + while (1) { + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path_change) + *path_change = 1; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != extent_key->objectid) + return -ENOENT; + + if (key.type != BTRFS_EXTENT_REF_V0_KEY) { + slot++; + continue; + } + ref0 = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_ref_v0); + *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + return 0; +} +#endif + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + int generation; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (item_size >= sizeof(*ei) + sizeof(*bi)) { + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + bi = (struct btrfs_tree_block_info *)(ei + 1); + generation = btrfs_extent_generation(eb, ei); + level = btrfs_tree_block_level(eb, bi); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int ret; + + BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, extent_key, + &ref_owner, NULL); + BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); + level = (int)ref_owner; + /* FIXME: get real generation */ + generation = 0; +#else + BUG(); +#endif + } + + btrfs_release_path(rc->extent_root, path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = extent_key->offset; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (tree_block_processed(bytenr, blocksize, rc)) + return 0; + + if (tree_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to check if the block use full backrefs for pointers in it + */ +static int block_use_full_backref(struct reloc_control *rc, + struct extent_buffer *eb) +{ + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u64 flags; + int ret; + + if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || + btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) + return 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = eb->start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = eb->len; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + BUG_ON(ret); + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + flags = btrfs_extent_flags(path->nodes[0], ei); + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = 1; + else + ret = 0; + btrfs_free_path(path); + return ret; +} + +/* + * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY + * this function scans fs tree to find blocks reference the data extent + */ +static int find_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct tree_block *block; + struct btrfs_root *root; + struct btrfs_file_extent_item *fi; + struct rb_node *rb_node; + struct btrfs_key key; + u64 ref_root; + u64 ref_objectid; + u64 ref_offset; + u32 ref_count; + u32 nritems; + int err = 0; + int added = 0; + int counted; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ref_root = btrfs_extent_data_ref_root(leaf, ref); + ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); + ref_offset = btrfs_extent_data_ref_offset(leaf, ref); + ref_count = btrfs_extent_data_ref_count(leaf, ref); + + root = read_fs_root(rc->extent_root->fs_info, ref_root); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + key.objectid = ref_objectid; + key.offset = ref_offset; + key.type = BTRFS_EXTENT_DATA_KEY; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + /* + * the references in tree blocks that use full backrefs + * are not counted in + */ + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + + while (ref_count > 0) { + while (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + WARN_ON(1); + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + added = 0; + + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) { + WARN_ON(1); + break; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + goto next; + + if (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid) + goto next; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + if (key.offset != ref_offset) + goto next; + + if (counted) + ref_count--; + if (added) + goto next; + + if (!tree_block_processed(leaf->start, leaf->len, rc)) { + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = leaf->start; + btrfs_item_key_to_cpu(leaf, &block->key, 0); + block->level = 0; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, + &block->rb_node); + BUG_ON(rb_node); + } + if (counted) + added = 1; + else + path->slots[0] = nritems; +next: + path->slots[0]++; + + } +out: + btrfs_free_path(path); + return err; +} + +/* + * hepler to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_data_ref *dref; + struct btrfs_extent_inline_ref *iref; + unsigned long ptr; + unsigned long end; + u32 blocksize; + int ret; + int err = 0; + + ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, + extent_key->offset); + BUG_ON(ret < 0); + if (ret > 0) { + /* the relocated data is fragmented */ + rc->extents_skipped++; + btrfs_release_path(rc->extent_root, path); + return 0; + } + + blocksize = btrfs_level_size(rc->extent_root, 0); + + eb = path->nodes[0]; + ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + end = ptr + btrfs_item_size_nr(eb, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ptr + sizeof(struct btrfs_extent_item_v0) == end) + ptr = end; + else +#endif + ptr += sizeof(struct btrfs_extent_item); + + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + BUG(); + } + ptr += btrfs_extent_inline_ref_size(key.type); + } + WARN_ON(ptr > end); + + while (1) { + cond_resched(); + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) { + err = ret; + break; + } + if (ret > 0) + break; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_key->objectid) + break; + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_DATA_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +#endif + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + ret = 0; + } + if (ret) { + err = ret; + break; + } + path->slots[0]++; + } + btrfs_release_path(rc->extent_root, path); + if (err) + free_block_list(blocks); + return err; +} + +/* + * hepler to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_path *path) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->key.objectid + rc->block_group->key.offset; + while (1) { + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + ret = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY); + + if (ret == 0 && start <= key.objectid) { + btrfs_release_path(rc->extent_root, path); + rc->search_start = end + 1; + } else { + rc->search_start = key.objectid + key.offset; + return 0; + } + } + btrfs_release_path(rc->extent_root, path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->trans_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->trans_mutex); +} + +static int check_extent_flags(u64 flags) +{ + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if (!(flags & BTRFS_EXTENT_FLAG_DATA) && + !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + return 1; + return 0; +} + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + unsigned long nr; + u64 flags; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); + + rc->create_reloc_root = 1; + set_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + while (1) { + trans = btrfs_start_transaction(rc->extent_root, 1); + + ret = find_next_extent(trans, rc, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (item_size >= sizeof(*ei)) { + flags = btrfs_extent_flags(path->nodes[0], ei); + ret = check_extent_flags(flags); + BUG_ON(ret); + + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int path_change = 0; + + BUG_ON(item_size != + sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, + &path_change); + if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) + flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else + flags = BTRFS_EXTENT_FLAG_DATA; + + if (path_change) { + btrfs_release_path(rc->extent_root, path); + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + } +#else + BUG(); +#endif + } + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(rc->extent_root, path); + ret = 0; + } + if (ret < 0) { + err = 0; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + err = ret; + break; + } + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, &key); + if (ret < 0) { + err = ret; + break; + } + } + } + btrfs_free_path(path); + + if (trans) { + nr = trans->blocks_used; + btrfs_end_transaction(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + + rc->create_reloc_root = 0; + smp_mb(); + + if (rc->extents_found > 0) { + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + } + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + /* get rid of pinned extents */ + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 size) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to create inode for data relocation. + * the inode is in data relocation tree and its link count is 0 + */ +static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key key; + unsigned long nr; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid, group->key.offset); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, + group->key.offset, 0, group->key.offset, + 0, 0, 0); + BUG_ON(err); + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct reloc_control *rc; + int ret; + int err = 0; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return -ENOMEM; + + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); + INIT_LIST_HEAD(&rc->reloc_roots); + + rc->block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", + fs_info->thread_pool_size); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + (unsigned long long)rc->block_group->key.objectid, + (unsigned long long)rc->block_group->flags); + + btrfs_start_delalloc_inodes(fs_info->tree_root); + btrfs_wait_ordered_extents(fs_info->tree_root, 0); + + while (1) { + mutex_lock(&fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(fs_info->tree_root); + mutex_unlock(&fs_info->cleaner_mutex); + + rc->extents_found = 0; + rc->extents_skipped = 0; + + ret = relocate_block_group(rc); + if (ret < 0) { + err = ret; + break; + } + + if (rc->extents_found == 0) + break; + + printk(KERN_INFO "btrfs: found %llu extents\n", + (unsigned long long)rc->extents_found); + + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } else if (rc->stage == UPDATE_DATA_PTRS && + rc->extents_skipped >= rc->extents_found) { + iput(rc->data_inode); + rc->data_inode = create_reloc_inode(fs_info, + rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + break; + } + rc->stage = MOVE_DATA_EXTENTS; + rc->found_file_extent = 0; + } + } + + filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); +out: + iput(rc->data_inode); + btrfs_stop_workers(&rc->workers); + btrfs_put_block_group(rc->block_group); + kfree(rc); + return err; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_root *root) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root->fs_info->tree_root, path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_fs_root_no_radix(root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out; + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(root->fs_info->tree_root, path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) { + err = -ENOMEM; + goto out; + } + + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", + root->fs_info->thread_pool_size); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(fs_root)); + + __add_reloc_root(reloc_root); + fs_root->reloc_root = reloc_root; + } + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); +out: + if (rc) { + btrfs_stop_workers(&rc->workers); + kfree(rc); + } + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = read_fs_root(root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + size_t offset; + int ret; + u64 disk_bytenr; + LIST_HEAD(list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b48650de447..0ddc6d61c55 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -111,6 +111,15 @@ out: return ret; } +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); + return 0; +} + /* * copy the data in 'item' into the btree */ @@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root * offset lower than the latest root. They need to be queued for deletion to * finish what was happening when we crashed. */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, - struct btrfs_root *latest) +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) { struct btrfs_root *dead_root; struct btrfs_item *item; @@ -227,10 +235,7 @@ again: goto err; } - if (objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_add_dead_reloc_root(dead_root); - else - ret = btrfs_add_dead_root(dead_root, latest); + ret = btrfs_add_dead_root(dead_root); if (ret) goto err; goto again; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9744af9d71e..9f179d4832d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,7 +52,6 @@ #include "export.h" #include "compression.h" - static struct super_operations btrfs_super_ops; static void btrfs_put_super(struct super_block *sb) @@ -67,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, - Opt_flushoncommit, Opt_err, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -84,9 +83,12 @@ static match_table_t tokens = { {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd, "nossd"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, {Opt_err, NULL}, }; @@ -157,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatacsum\n"); + printk(KERN_INFO "btrfs: setting nodatasum\n"); btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: @@ -173,6 +175,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); break; + case Opt_ssd_spread: + printk(KERN_INFO "btrfs: use spread ssd " + "allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nossd: + printk(KERN_INFO "btrfs: not using ssd allocation " + "scheme\n"); + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; case Opt_nobarrier: printk(KERN_INFO "btrfs: turning off barriers\n"); btrfs_set_opt(info->mount_opt, NOBARRIER); @@ -195,7 +210,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_extent = max_t(u64, info->max_extent, root->sectorsize); printk(KERN_INFO "btrfs: max_extent at %llu\n", - info->max_extent); + (unsigned long long)info->max_extent); } break; case Opt_max_inline: @@ -210,7 +225,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - info->max_inline); + (unsigned long long)info->max_inline); } break; case Opt_alloc_start: @@ -220,7 +235,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - info->alloc_start); + (unsigned long long)info->alloc_start); } break; case Opt_noacl: @@ -234,6 +249,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; default: break; } @@ -312,7 +336,7 @@ static int btrfs_fill_super(struct super_block *sb, struct dentry *root_dentry; struct btrfs_super_block *disk_super; struct btrfs_root *tree_root; - struct btrfs_inode *bi; + struct btrfs_key key; int err; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -331,23 +355,15 @@ static int btrfs_fill_super(struct super_block *sb, } sb->s_fs_info = tree_root; disk_super = &tree_root->fs_info->super_copy; - inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, - tree_root->fs_info->fs_root); - bi = BTRFS_I(inode); - bi->location.objectid = inode->i_ino; - bi->location.offset = 0; - bi->root = tree_root->fs_info->fs_root; - btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); - - if (!inode) { - err = -ENOMEM; + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto fail_close; } - if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } root_dentry = d_alloc_root(inode); if (!root_dentry) { @@ -378,10 +394,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_root *root = btrfs_sb(sb); int ret; - if (sb->s_flags & MS_RDONLY) - return 0; - - sb->s_dirt = 0; if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; @@ -392,7 +404,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); - sb->s_dirt = 0; return ret; } @@ -410,32 +421,34 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_extent != (u64)-1) - seq_printf(seq, ",max_extent=%llu", info->max_extent); + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", info->max_inline); + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); if (btrfs_test_opt(root, COMPRESS)) seq_puts(seq, ",compress"); - if (btrfs_test_opt(root, SSD)) + if (btrfs_test_opt(root, NOSSD)) + seq_puts(seq, ",nossd"); + if (btrfs_test_opt(root, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(root, SSD)) seq_puts(seq, ",ssd"); if (btrfs_test_opt(root, NOTREELOG)) - seq_puts(seq, ",no-treelog"); + seq_puts(seq, ",notreelog"); if (btrfs_test_opt(root, FLUSHONCOMMIT)) - seq_puts(seq, ",flush-on-commit"); + seq_puts(seq, ",flushoncommit"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); return 0; } -static void btrfs_write_super(struct super_block *sb) -{ - sb->s_dirt = 0; -} - static int btrfs_test_super(struct super_block *s, void *data) { struct btrfs_fs_devices *test_fs_devices = data; @@ -489,8 +502,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -EBUSY; goto error_close_devices; } @@ -504,8 +516,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); goto error_free_subvol_name; } @@ -522,15 +533,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, mutex_unlock(&s->s_root->d_inode->i_mutex); if (IS_ERR(root)) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = PTR_ERR(root); goto error_free_subvol_name; } if (!root->d_inode) { dput(root); - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -ENXIO; goto error_free_subvol_name; } @@ -575,7 +584,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_reloc_trees(root); + /* recover relocation */ + ret = btrfs_recover_relocation(root); WARN_ON(ret); ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -635,14 +645,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol = kmalloc(sizeof(*vol), GFP_KERNEL); - if (!vol) - return -ENOMEM; - - if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { - ret = -EFAULT; - goto out; - } + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -650,7 +655,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, &btrfs_fs_type, &fs_devices); break; } -out: + kfree(vol); return ret; } @@ -674,7 +679,6 @@ static int btrfs_unfreeze(struct super_block *sb) static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, - .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2869b3361eb..2e177d7f4bb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -25,7 +25,6 @@ #include "disk-io.h" #include "transaction.h" #include "locking.h" -#include "ref-cache.h" #include "tree-log.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -94,45 +93,37 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -noinline int btrfs_record_root_in_trans(struct btrfs_root *root) +static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; - u64 running_trans_id = root->fs_info->running_transaction->transid; - if (root->ref_cows && root->last_trans < running_trans_id) { + if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); - if (root->root_item.refs != 0) { - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - - dirty = kmalloc(sizeof(*dirty), GFP_NOFS); - BUG_ON(!dirty); - dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); - BUG_ON(!dirty->root); - dirty->latest_root = root; - INIT_LIST_HEAD(&dirty->list); - - root->commit_root = btrfs_root_node(root); - - memcpy(dirty->root, root, sizeof(*root)); - spin_lock_init(&dirty->root->node_lock); - spin_lock_init(&dirty->root->list_lock); - mutex_init(&dirty->root->objectid_mutex); - mutex_init(&dirty->root->log_mutex); - INIT_LIST_HEAD(&dirty->root->dead_list); - dirty->root->node = root->commit_root; - dirty->root->commit_root = NULL; + WARN_ON(root->root_item.refs == 0); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + root->last_trans = trans->transid; + btrfs_init_reloc_root(trans, root); + } + return 0; +} - spin_lock(&root->list_lock); - list_add(&dirty->root->dead_list, &root->dead_list); - spin_unlock(&root->list_lock); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; - root->dirty_root = dirty; - } else { - WARN_ON(1); - } - root->last_trans = running_trans_id; + mutex_lock(&root->fs_info->trans_mutex); + if (root->last_trans == trans->transid) { + mutex_unlock(&root->fs_info->trans_mutex); + return 0; } + + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->trans_mutex); return 0; } @@ -181,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, ret = join_transaction(root); BUG_ON(ret); - btrfs_record_root_in_trans(root); h->transid = root->fs_info->running_transaction->transid; h->transaction = root->fs_info->running_transaction; h->blocks_reserved = num_blocks; @@ -192,6 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->delayed_ref_updates = 0; root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); return h; } @@ -233,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } +#if 0 /* * rate limit against the drop_snapshot code. This helps to slow down new * operations if the drop_snapshot code isn't able to keep up. @@ -273,6 +265,7 @@ harder: goto harder; } } +#endif void btrfs_throttle(struct btrfs_root *root) { @@ -280,7 +273,6 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); - throttle_on_drops(root); } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -323,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (throttle) - throttle_on_drops(root); - return 0; } @@ -462,12 +451,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) break; - btrfs_set_root_bytenr(&root->root_item, - root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(root->node)); - btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_node(&root->root_item, root->node); ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); @@ -477,14 +462,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); } + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); return 0; } /* * update all the cowonly tree roots on disk */ -int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; @@ -520,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) +int btrfs_add_dead_root(struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; - - dirty = kmalloc(sizeof(*dirty), GFP_NOFS); - if (!dirty) - return -ENOMEM; - dirty->root = root; - dirty->latest_root = latest; - mutex_lock(&root->fs_info->trans_mutex); - list_add(&dirty->list, &latest->fs_info->dead_roots); + list_add(&root->root_list, &root->fs_info->dead_roots); mutex_unlock(&root->fs_info->trans_mutex); return 0; } /* - * at transaction commit time we need to schedule the old roots for - * deletion via btrfs_drop_snapshot. This runs through all the - * reference counted roots that were modified in the current - * transaction and puts them into the drop list + * update all the cowonly tree roots on disk */ -static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, - struct radix_tree_root *radix, - struct list_head *list) +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; struct btrfs_root *gang[8]; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info = root->fs_info; int i; int ret; int err = 0; - u32 refs; while (1) { - ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { root = gang[i]; - radix_tree_tag_clear(radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - - BUG_ON(!root->ref_tree); - dirty = root->dirty_root; + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); btrfs_free_log(trans, root); - btrfs_free_reloc_root(trans, root); - - if (root->commit_root == root->node) { - WARN_ON(root->node->start != - btrfs_root_bytenr(&root->root_item)); - - free_extent_buffer(root->commit_root); - root->commit_root = NULL; - root->dirty_root = NULL; - - spin_lock(&root->list_lock); - list_del_init(&dirty->root->dead_list); - spin_unlock(&root->list_lock); + btrfs_update_reloc_root(trans, root); - kfree(dirty->root); - kfree(dirty); - - /* make sure to update the root on disk - * so we get any updates to the block used - * counts - */ - err = btrfs_update_root(trans, - root->fs_info->tree_root, - &root->root_key, - &root->root_item); + if (root->commit_root == root->node) continue; - } - memset(&root->root_item.drop_progress, 0, - sizeof(struct btrfs_disk_key)); - root->root_item.drop_level = 0; - root->commit_root = NULL; - root->dirty_root = NULL; - root->root_key.offset = root->fs_info->generation; - btrfs_set_root_bytenr(&root->root_item, - root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(root->node)); - btrfs_set_root_generation(&root->root_item, - root->root_key.offset); - - err = btrfs_insert_root(trans, root->fs_info->tree_root, + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + + btrfs_set_root_node(&root->root_item, root->node); + err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (err) break; - - refs = btrfs_root_refs(&dirty->root->root_item); - btrfs_set_root_refs(&dirty->root->root_item, refs - 1); - err = btrfs_update_root(trans, root->fs_info->tree_root, - &dirty->root->root_key, - &dirty->root->root_item); - - BUG_ON(err); - if (refs == 1) { - list_add(&dirty->list, list); - } else { - WARN_ON(1); - free_extent_buffer(dirty->root->node); - kfree(dirty->root); - kfree(dirty); - } } } return err; @@ -687,7 +610,9 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) prepare_to_wait(&info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + schedule(); + mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } @@ -699,111 +624,61 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them */ -static noinline int drop_dirty_roots(struct btrfs_root *tree_root, - struct list_head *list) +int btrfs_drop_dead_root(struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; unsigned long nr; - u64 num_bytes; - u64 bytes_used; - u64 max_useless; - int ret = 0; - int err; - - while (!list_empty(list)) { - struct btrfs_root *root; - - dirty = list_entry(list->prev, struct btrfs_dirty_root, list); - list_del_init(&dirty->list); - - num_bytes = btrfs_root_used(&dirty->root->root_item); - root = dirty->latest_root; - atomic_inc(&root->fs_info->throttles); - - while (1) { - /* - * we don't want to jump in and create a bunch of - * delayed refs if the transaction is starting to close - */ - wait_transaction_pre_flush(tree_root->fs_info); - trans = btrfs_start_transaction(tree_root, 1); - - /* - * we've joined a transaction, make sure it isn't - * closing right now - */ - if (trans->transaction->delayed_refs.flushing) { - btrfs_end_transaction(trans, tree_root); - continue; - } - - mutex_lock(&root->fs_info->drop_mutex); - ret = btrfs_drop_snapshot(trans, dirty->root); - if (ret != -EAGAIN) - break; - mutex_unlock(&root->fs_info->drop_mutex); + int ret; - err = btrfs_update_root(trans, - tree_root, - &dirty->root->root_key, - &dirty->root->root_item); - if (err) - ret = err; - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, tree_root); - BUG_ON(ret); + while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); + trans = btrfs_start_transaction(tree_root, 1); - btrfs_btree_balance_dirty(tree_root, nr); - cond_resched(); + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; } - BUG_ON(ret); - atomic_dec(&root->fs_info->throttles); - wake_up(&root->fs_info->transaction_throttle); - num_bytes -= btrfs_root_used(&dirty->root->root_item); - bytes_used = btrfs_root_used(&root->root_item); - if (num_bytes) { - mutex_lock(&root->fs_info->trans_mutex); - btrfs_record_root_in_trans(root); - mutex_unlock(&root->fs_info->trans_mutex); - btrfs_set_root_used(&root->root_item, - bytes_used - num_bytes); - } + ret = btrfs_drop_snapshot(trans, root); + if (ret != -EAGAIN) + break; - ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); - if (ret) { - BUG(); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + if (ret) break; - } - mutex_unlock(&root->fs_info->drop_mutex); - - spin_lock(&root->list_lock); - list_del_init(&dirty->root->dead_list); - if (!list_empty(&root->dead_list)) { - struct btrfs_root *oldest; - oldest = list_entry(root->dead_list.prev, - struct btrfs_root, dead_list); - max_useless = oldest->root_key.offset - 1; - } else { - max_useless = root->root_key.offset - 1; - } - spin_unlock(&root->list_lock); nr = trans->blocks_used; ret = btrfs_end_transaction(trans, tree_root); BUG_ON(ret); - ret = btrfs_remove_leaf_refs(root, max_useless, 0); - BUG_ON(ret); - - free_extent_buffer(dirty->root->node); - kfree(dirty->root); - kfree(dirty); - btrfs_btree_balance_dirty(tree_root, nr); cond_resched(); } + BUG_ON(ret); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + + btrfs_btree_balance_dirty(tree_root, nr); return ret; } @@ -833,24 +708,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) goto fail; - btrfs_record_root_in_trans(root); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); key.objectid = objectid; - key.offset = trans->transid; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_set_lock_blocking(old); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_set_root_bytenr(new_root_item, tmp->start); - btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); - btrfs_set_root_generation(new_root_item, trans->transid); + btrfs_set_root_node(new_root_item, tmp); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); @@ -958,6 +832,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, return 0; } +static void update_super_roots(struct btrfs_root *root) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = &root->fs_info->super_copy; + + root_item = &root->fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &root->fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -965,8 +857,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; - struct btrfs_root *chunk_root = root->fs_info->chunk_root; - struct list_head dirty_fs_roots; struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; @@ -993,7 +883,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); mutex_lock(&root->fs_info->trans_mutex); - INIT_LIST_HEAD(&dirty_fs_roots); if (cur_trans->in_commit) { cur_trans->use_count++; mutex_unlock(&root->fs_info->trans_mutex); @@ -1099,41 +988,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * with the tree-log code. */ mutex_lock(&root->fs_info->tree_log_mutex); - /* - * keep tree reloc code from adding new reloc trees - */ - mutex_lock(&root->fs_info->tree_reloc_mutex); - - ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, - &dirty_fs_roots); + ret = commit_fs_roots(trans, root); BUG_ON(ret); - /* add_dirty_roots gets rid of all the tree log roots, it is now + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ btrfs_free_log_root_tree(trans, root->fs_info); - ret = btrfs_commit_tree_roots(trans, root); + ret = commit_cowonly_roots(trans, root); BUG_ON(ret); cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->new_trans_lock); - btrfs_set_super_generation(&root->fs_info->super_copy, - cur_trans->transid); - btrfs_set_super_root(&root->fs_info->super_copy, - root->fs_info->tree_root->node->start); - btrfs_set_super_root_level(&root->fs_info->super_copy, - btrfs_header_level(root->fs_info->tree_root->node)); - - btrfs_set_super_chunk_root(&root->fs_info->super_copy, - chunk_root->node->start); - btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, - btrfs_header_level(chunk_root->node)); - btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, - btrfs_header_generation(chunk_root->node)); + + btrfs_set_root_node(&root->fs_info->tree_root->root_item, + root->fs_info->tree_root->node); + free_extent_buffer(root->fs_info->tree_root->commit_root); + root->fs_info->tree_root->commit_root = + btrfs_root_node(root->fs_info->tree_root); + + btrfs_set_root_node(&root->fs_info->chunk_root->root_item, + root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + root->fs_info->chunk_root->commit_root = + btrfs_root_node(root->fs_info->chunk_root); + + update_super_roots(root); if (!root->fs_info->log_root_recovering) { btrfs_set_super_log_root(&root->fs_info->super_copy, 0); @@ -1147,7 +1031,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->blocked = 0; - wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); mutex_unlock(&root->fs_info->trans_mutex); @@ -1164,9 +1047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root, pinned_copy); kfree(pinned_copy); - btrfs_drop_dead_reloc_roots(root); - mutex_unlock(&root->fs_info->tree_reloc_mutex); - /* do the directory inserts of any pending snapshot creations */ finish_pending_snapshots(trans, root->fs_info); @@ -1180,16 +1060,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); - list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); - if (root->fs_info->closing) - list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); - mutex_unlock(&root->fs_info->trans_mutex); kmem_cache_free(btrfs_trans_handle_cachep, trans); - - if (root->fs_info->closing) - drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); return ret; } @@ -1198,16 +1071,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ int btrfs_clean_old_snapshots(struct btrfs_root *root) { - struct list_head dirty_roots; - INIT_LIST_HEAD(&dirty_roots); -again: - mutex_lock(&root->fs_info->trans_mutex); - list_splice_init(&root->fs_info->dead_roots, &dirty_roots); - mutex_unlock(&root->fs_info->trans_mutex); + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->trans_mutex); + list_splice_init(&fs_info->dead_roots, &list); + mutex_unlock(&fs_info->trans_mutex); - if (!list_empty(&dirty_roots)) { - drop_dirty_roots(root, &dirty_roots); - goto again; + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); + list_del_init(&root->root_list); + btrfs_drop_dead_root(root); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 94f5bde2b58..961c3ee5a2e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -62,12 +62,6 @@ struct btrfs_pending_snapshot { struct list_head list; }; -struct btrfs_dirty_root { - struct list_head list; - struct btrfs_root *root; - struct btrfs_root *latest_root; -}; - static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); +int btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_drop_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -108,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); -int btrfs_record_root_in_trans(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 25f20ea11f2..c13922206d1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -430,18 +430,16 @@ no_copy: static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { + struct btrfs_key key; struct inode *inode; - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - if (is_bad_inode(inode)) { + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { iput(inode); inode = NULL; } @@ -536,11 +534,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; unsigned long dest_offset; struct btrfs_key ins; @@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); if (ins.objectid > 0) { u64 csum_start; @@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid); + 0, root->root_key.objectid, + key->objectid, offset); } else { /* * insert the extent pointer in the extent * allocation tree */ - ret = btrfs_alloc_logged_extent(trans, root, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid, - &ins); + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); BUG_ON(ret); } btrfs_release_path(root, path); @@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, @@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); @@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, - next); - BUG_ON(ret); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, path->nodes[*level]->start, @@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (orig_level == 0) { - ret = btrfs_drop_leaf_ref(trans, log, - next); - BUG_ON(ret); - } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(log, next->start, @@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - btrfs_set_root_bytenr(&log->root_item, log->node->start); - btrfs_set_root_generation(&log->root_item, trans->transid); - btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; root->log_transid++; @@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_keys, ins_sizes, nr); BUG_ON(ret); - for (i = 0; i < nr; i++) { + for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_path->slots[0]); @@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds = btrfs_file_extent_disk_bytenr(src, - extent); - u64 dl = btrfs_file_extent_disk_num_bytes(src, - extent); - u64 cs = btrfs_file_extent_offset(src, extent); - u64 cl = btrfs_file_extent_num_bytes(src, - extent);; + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent);; if (btrfs_file_extent_compression(src, extent)) { cs = 0; cl = dl; } - /* ds == 0 is a hole */ - if (ds != 0) { - ret = btrfs_inc_extent_ref(trans, log, - ds, dl, - dst_path->nodes[0]->start, - BTRFS_TREE_LOG_OBJECTID, - trans->transid, - ins_keys[i].objectid); - BUG_ON(ret); - ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums); - BUG_ON(ret); - } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); } } - dst_path->slots[0]++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); @@ -3029,9 +3001,7 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; - mutex_lock(&fs_info->trans_mutex); - btrfs_record_root_in_trans(wc.replay_dest); - mutex_unlock(&fs_info->trans_mutex); + btrfs_record_root_in_trans(trans, wc.replay_dest); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -3049,6 +3019,7 @@ again: key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); kfree(log); if (found_key.offset == 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e0913e46972..3ab80e9cd76 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + /* * we try to collect pending bios for a device so we don't get a large * number of procs sending bios down to the same device. This greatly @@ -141,32 +155,49 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *pending; struct backing_dev_info *bdi; struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; struct bio *tail; struct bio *cur; int again = 0; - unsigned long num_run = 0; + unsigned long num_run; + unsigned long num_sync_run; + unsigned long batch_run = 0; unsigned long limit; unsigned long last_waited = 0; + int force_reg = 0; bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* we want to make sure that every time we switch from the sync + * list to the normal list, we unplug + */ + num_sync_run = 0; + loop: spin_lock(&device->io_lock); loop_lock: + num_run = 0; + /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - pending = device->pending_bios; - tail = device->pending_bio_tail; + if (!force_reg && device->pending_sync_bios.head) { + pending_bios = &device->pending_sync_bios; + force_reg = 1; + } else { + pending_bios = &device->pending_bios; + force_reg = 0; + } + + pending = pending_bios->head; + tail = pending_bios->tail; WARN_ON(pending && !tail); - device->pending_bios = NULL; - device->pending_bio_tail = NULL; /* * if pending was null this time around, no bios need processing @@ -176,16 +207,45 @@ loop_lock: * device->running_pending is used to synchronize with the * schedule_bio code. */ - if (pending) { - again = 1; - device->running_pending = 1; - } else { + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { again = 0; device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + spin_unlock(&device->io_lock); + /* + * if we're doing the regular priority list, make sure we unplug + * for any high prio bios we've sent down + */ + if (pending_bios == &device->pending_bios && num_sync_run > 0) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + while (pending) { + + rmb(); + /* we want to work on both lists, but do more bios on the + * sync list than the regular list + */ + if ((num_run > 32 && + pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head) || + (num_run > 64 && pending_bios == &device->pending_sync_bios && + device->pending_bios.head)) { + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + cur = pending; pending = pending->bi_next; cur->bi_next = NULL; @@ -196,19 +256,28 @@ loop_lock: wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - bio_get(cur); submit_bio(cur->bi_rw, cur); - bio_put(cur); num_run++; + batch_run++; + + if (bio_sync(cur)) + num_sync_run++; + + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } /* * we made progress, there is more work to do and the bdi * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && num_run > 16 && + if (pending && bdi_write_congested(bdi) && batch_run > 32 && fs_info->fs_devices->open_devices > 1) { - struct bio *old_head; struct io_context *ioc; ioc = current->io_context; @@ -233,17 +302,17 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } continue; } spin_lock(&device->io_lock); - - old_head = device->pending_bios; - device->pending_bios = pending; - if (device->pending_bio_tail) - tail->bi_next = old_head; - else - device->pending_bio_tail = tail; - + requeue_list(pending_bios, pending, tail); device->running_pending = 1; spin_unlock(&device->io_lock); @@ -251,11 +320,18 @@ loop_lock: goto done; } } + + if (num_sync_run) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + cond_resched(); if (again) goto loop; spin_lock(&device->io_lock); - if (device->pending_bios) + if (device->pending_bios.head || device->pending_sync_bios.head) goto loop_lock; spin_unlock(&device->io_lock); @@ -301,6 +377,7 @@ static noinline int device_list_add(const char *path, memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; + mutex_init(&fs_devices->device_list_mutex); device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -327,7 +404,11 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } INIT_LIST_HEAD(&device->dev_alloc_list); + + mutex_lock(&fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_devices->devices); + mutex_unlock(&fs_devices->device_list_mutex); + device->fs_devices = fs_devices; fs_devices->num_devices++; } @@ -353,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) INIT_LIST_HEAD(&fs_devices->devices); INIT_LIST_HEAD(&fs_devices->alloc_list); INIT_LIST_HEAD(&fs_devices->list); + mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + mutex_lock(&orig->device_list_mutex); list_for_each_entry(orig_dev, &orig->devices, dev_list) { device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) @@ -378,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } + mutex_unlock(&orig->device_list_mutex); return fs_devices; error: + mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } @@ -390,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); again: + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) continue; @@ -409,6 +495,7 @@ again: kfree(device->name); kfree(device); } + mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->seed) { fs_devices = fs_devices->seed; @@ -529,6 +616,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, device->in_fs_metadata = 0; device->mode = flags; + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + fs_devices->rotating = 1; + fs_devices->open_devices++; if (device->writeable) { fs_devices->rw_devices++; @@ -1056,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device = NULL; devices = &root->fs_info->fs_devices->devices; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(tmp, devices, dev_list) { if (tmp->in_fs_metadata && !tmp->bdev) { device = tmp; break; } } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); bdev = NULL; bh = NULL; disk_super = NULL; @@ -1116,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto error_brelse; device->in_fs_metadata = 0; + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_del_init(&device->dev_list); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + device->fs_devices->num_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, @@ -1210,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); list_splice_init(&fs_devices->devices, &seed_devices->devices); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { @@ -1335,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; + /* + * we have the volume lock, so we don't need the extra + * device list mutex while reading the list here. + */ list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; @@ -1375,6 +1481,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->io_align = root->sectorsize; device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -1388,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } device->fs_devices = root->fs_info->fs_devices; + + /* + * we don't want write_supers to jump in here with our device + * half setup + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -1396,6 +1509,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->rw_devices++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + root->fs_info->fs_devices->rotating = 1; + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); btrfs_set_super_total_bytes(&root->fs_info->super_copy, total_bytes + device->total_bytes); @@ -1403,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); btrfs_set_super_num_devices(&root->fs_info->super_copy, total_bytes + 1); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { ret = init_first_rw_device(trans, root, device); @@ -1478,7 +1595,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -1605,8 +1722,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, int ret; int i; - printk(KERN_INFO "btrfs relocating chunk %llu\n", - (unsigned long long)chunk_offset); root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; @@ -1875,14 +1990,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->total_bytes = new_size; if (device->writeable) device->fs_devices->total_rw_bytes -= diff; - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); btrfs_end_transaction(trans, root); @@ -1914,7 +2021,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) - goto done; + break; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -1927,6 +2034,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; } + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); done: btrfs_free_path(path); return ret; @@ -2497,7 +2624,7 @@ again: max_errors = 1; } } - if (multi_ret && rw == WRITE && + if (multi_ret && (rw & (1 << BIO_RW)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2762,6 +2889,7 @@ static noinline int schedule_bio(struct btrfs_root *root, int rw, struct bio *bio) { int should_queue = 1; + struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ if (!(rw & (1 << BIO_RW))) { @@ -2783,13 +2911,17 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); + if (bio_sync(bio)) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; - if (device->pending_bio_tail) - device->pending_bio_tail->bi_next = bio; + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; - device->pending_bio_tail = bio; - if (!device->pending_bios) - device->pending_bios = bio; + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; if (device->running_pending) should_queue = 0; @@ -3006,7 +3138,8 @@ static int fill_device_from_item(struct extent_buffer *leaf, unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); - device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2185de72ff7..5139a833f72 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,13 +23,22 @@ #include "async-thread.h" struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; struct btrfs_root *dev_root; - struct bio *pending_bios; - struct bio *pending_bio_tail; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + int running_pending; u64 generation; @@ -52,6 +61,9 @@ struct btrfs_device { /* size of the device */ u64 total_bytes; + /* size of the disk */ + u64 disk_total_bytes; + /* bytes used */ u64 bytes_used; @@ -84,7 +96,12 @@ struct btrfs_fs_devices { u64 rw_devices; u64 total_rw_bytes; struct block_device *latest_bdev; - /* all of the devices in the FS */ + + /* all of the devices in the FS, protected by a mutex + * so we can safely walk it to write out the supers without + * worrying about add/remove by the multi-device code + */ + struct mutex device_list_mutex; struct list_head devices; /* devices not currently being allocated */ @@ -95,6 +112,11 @@ struct btrfs_fs_devices { int seeding; int opened; + + /* set when we find or add a device that doesn't have the + * nonrot flag set + */ + int rotating; }; struct btrfs_bio_stripe { |