diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/Kconfig | 31 | ||||
-rw-r--r-- | fs/btrfs/async-thread.c | 61 | ||||
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 8 | ||||
-rw-r--r-- | fs/btrfs/compression.c | 1 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 315 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 77 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 166 | ||||
-rw-r--r-- | fs/btrfs/disk-io.h | 12 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 771 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 134 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 18 | ||||
-rw-r--r-- | fs/btrfs/extent_map.c | 1 | ||||
-rw-r--r-- | fs/btrfs/file.c | 29 | ||||
-rw-r--r-- | fs/btrfs/inode-map.c | 1 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 142 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 7 | ||||
-rw-r--r-- | fs/btrfs/ioctl.h | 14 | ||||
-rw-r--r-- | fs/btrfs/locking.c | 207 | ||||
-rw-r--r-- | fs/btrfs/locking.h | 6 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.c | 4 | ||||
-rw-r--r-- | fs/btrfs/ref-cache.c | 1 | ||||
-rw-r--r-- | fs/btrfs/ref-cache.h | 1 | ||||
-rw-r--r-- | fs/btrfs/super.c | 14 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 6 | ||||
-rw-r--r-- | fs/btrfs/tree-defrag.c | 1 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 356 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 54 | ||||
-rw-r--r-- | fs/btrfs/xattr.c | 48 | ||||
-rw-r--r-- | fs/btrfs/xattr.h | 2 |
29 files changed, 1876 insertions, 612 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig new file mode 100644 index 00000000000..7bb3c020e57 --- /dev/null +++ b/fs/btrfs/Kconfig @@ -0,0 +1,31 @@ +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + +config BTRFS_FS_POSIX_ACL + bool "Btrfs POSIX Access Control Lists" + depends on BTRFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 8e2fec05dbe..c84ca1f5259 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -16,11 +16,11 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/version.h> #include <linux/kthread.h> #include <linux/list.h> #include <linux/spinlock.h> -# include <linux/freezer.h> +#include <linux/freezer.h> +#include <linux/ftrace.h> #include "async-thread.h" #define WORK_QUEUED_BIT 0 @@ -143,6 +143,7 @@ static int worker_loop(void *arg) struct btrfs_work *work; do { spin_lock_irq(&worker->lock); +again_locked: while (!list_empty(&worker->pending)) { cur = worker->pending.next; work = list_entry(cur, struct btrfs_work, list); @@ -165,14 +166,50 @@ static int worker_loop(void *arg) check_idle_worker(worker); } - worker->working = 0; if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); refrigerator(); } else { - set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) + if (!kthread_should_stop()) { + cpu_relax(); + /* + * we've dropped the lock, did someone else + * jump_in? + */ + smp_mb(); + if (!list_empty(&worker->pending)) + continue; + + /* + * this short schedule allows more work to + * come in without the queue functions + * needing to go through wake_up_process() + * + * worker->working is still 1, so nobody + * is going to try and wake us up + */ + schedule_timeout(1); + smp_mb(); + if (!list_empty(&worker->pending)) + continue; + + /* still no more work?, sleep for real */ + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending)) + goto again_locked; + + /* + * this makes sure we get a wakeup when someone + * adds something new to the queue + */ + worker->working = 0; + spin_unlock_irq(&worker->lock); + schedule(); + } __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work) { struct btrfs_worker_thread *worker = work->worker; unsigned long flags; + int wake = 0; if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) goto out; spin_lock_irqsave(&worker->lock, flags); - atomic_inc(&worker->num_pending); list_add_tail(&work->list, &worker->pending); + atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle * list @@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work) &worker->workers->worker_list); spin_unlock_irqrestore(&worker->workers->lock, flags); } + if (!worker->working) { + wake = 1; + worker->working = 1; + } spin_unlock_irqrestore(&worker->lock, flags); - + if (wake) + wake_up_process(worker->task); out: + return 0; } @@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) } spin_lock_irqsave(&worker->lock, flags); + + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); - list_add_tail(&work->list, &worker->pending); /* * avoid calling into wake_up_process if this thread has already diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a8c9693b75a..72677ce2b74 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,6 +66,9 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* the space_info for where this inode's data allocations are done */ + struct btrfs_space_info *space_info; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -94,6 +97,11 @@ struct btrfs_inode { */ u64 delalloc_bytes; + /* total number of bytes that may be used for this inode for + * delalloc + */ + u64 reserved_bytes; + /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ee848d8585d..ab07627084f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,7 +32,6 @@ #include <linux/swap.h> #include <linux/writeback.h> #include <linux/bit_spinlock.h> -#include <linux/version.h> #include <linux/pagevec.h> #include "compat.h" #include "ctree.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9e46c077681..42491d728e9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,22 +38,64 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -inline void btrfs_init_path(struct btrfs_path *p) -{ - memset(p, 0, sizeof(*p)); -} - struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; - path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); - if (path) { - btrfs_init_path(path); + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + if (path) path->reada = 1; - } return path; } +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_set_lock_blocking(p->nodes[i]); + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) +{ + int i; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { + if (p->nodes[i] && p->locks[i]) + btrfs_clear_lock_blocking(p->nodes[i]); + } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif +} + /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { @@ -261,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, level, &ins); BUG_ON(ret); cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len); + buf->len, level); } else { cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, @@ -272,6 +314,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (IS_ERR(cow)) return PTR_ERR(cow); + /* cow is set to blocking by btrfs_init_new_buffer */ + copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); @@ -388,17 +432,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - spin_lock(&root->fs_info->hash_lock); if (btrfs_header_generation(buf) == trans->transid && btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; - spin_unlock(&root->fs_info->hash_lock); WARN_ON(prealloc_dest); return 0; } - spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0, prealloc_dest); @@ -504,6 +551,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems == 1) return 0; + btrfs_set_lock_blocking(parent); + for (i = start_slot; i < end_slot; i++) { int close = 1; @@ -564,6 +613,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -862,6 +912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; mid = path->nodes[level]; + WARN_ON(!path->locks[level]); WARN_ON(btrfs_header_generation(mid) != trans->transid); @@ -883,8 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - btrfs_tree_lock(child); BUG_ON(!child); + btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); @@ -900,6 +952,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); btrfs_tree_unlock(child); + path->locks[level] = 0; path->nodes[level] = NULL; clean_tree_block(trans, root, mid); @@ -924,6 +977,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, 0); if (wret) { @@ -934,6 +988,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = read_node_slot(root, parent, pslot + 1); if (right) { btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, 0); if (wret) { @@ -1109,6 +1164,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1155,7 +1212,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, */ if (right) { u32 right_nr; + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1210,8 +1270,7 @@ static noinline void reada_for_search(struct btrfs_root *root, struct btrfs_disk_key disk_key; u32 nritems; u64 search; - u64 lowest_read; - u64 highest_read; + u64 target; u64 nread = 0; int direction = path->reada; struct extent_buffer *eb; @@ -1235,8 +1294,7 @@ static noinline void reada_for_search(struct btrfs_root *root, return; } - highest_read = search; - lowest_read = search; + target = search; nritems = btrfs_header_nritems(node); nr = slot; @@ -1256,27 +1314,80 @@ static noinline void reada_for_search(struct btrfs_root *root, break; } search = btrfs_node_blockptr(node, nr); - if ((search >= lowest_read && search <= highest_read) || - (search < lowest_read && lowest_read - search <= 16384) || - (search > highest_read && search - highest_read <= 16384)) { + if ((search <= target && target - search <= 65536) || + (search > target && search - target <= 65536)) { readahead_tree_block(root, search, blocksize, btrfs_node_ptr_generation(node, nr)); nread += blocksize; } nscan++; - if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + if ((nread > 65536 || nscan > 32)) break; + } +} - if (nread > (256 * 1024) || nscan > 128) - break; +/* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + int ret = 0; + int blocksize; + + parent = path->nodes[level - 1]; + if (!parent) + return 0; - if (search < lowest_read) - lowest_read = search; - if (search > highest_read) - highest_read = search; + nritems = btrfs_header_nritems(parent); + slot = path->slots[level]; + blocksize = btrfs_level_size(root, level); + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root, block1, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block1 = 0; + free_extent_buffer(eb); + } + if (slot < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root, block2, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block2 = 0; + free_extent_buffer(eb); } + if (block1 || block2) { + ret = -EAGAIN; + btrfs_release_path(root, path); + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); + + if (block1) { + eb = read_tree_block(root, block1, blocksize, 0); + free_extent_buffer(eb); + } + if (block1) { + eb = read_tree_block(root, block2, blocksize, 0); + free_extent_buffer(eb); + } + } + return ret; } + /* * when we walk down the tree, it is usually safe to unlock the higher layers * in the tree. The exceptions are when our path goes through slot 0, because @@ -1328,6 +1439,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks || path->lowest_level) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; + } +} + +/* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf * level of the path (level 0) @@ -1387,32 +1524,30 @@ again: int wret; /* is a cow on this block not required */ - spin_lock(&root->fs_info->hash_lock); if (btrfs_header_generation(b) == trans->transid && btrfs_header_owner(b) == root->root_key.objectid && !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { - spin_unlock(&root->fs_info->hash_lock); goto cow_done; } - spin_unlock(&root->fs_info->hash_lock); /* ok, we have to cow, is our old prealloc the right * size? */ if (prealloc_block.objectid && prealloc_block.offset != b->len) { + btrfs_release_path(root, p); btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); prealloc_block.objectid = 0; + goto again; } /* * for higher level blocks, try not to allocate blocks * with the block and the parent locks held. */ - if (level > 1 && !prealloc_block.objectid && - btrfs_path_lock_waiting(p, level)) { + if (level > 0 && !prealloc_block.objectid) { u32 size = b->len; u64 hint = b->start; @@ -1425,6 +1560,8 @@ again: goto again; } + btrfs_set_path_blocking(p); + wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], @@ -1446,6 +1583,22 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; + btrfs_clear_path_blocking(p, NULL); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If cow is true, then we might be changing slot zero, + * which may require changing the parent. So, we can't + * drop the lock until after we know which slot we're + * operating on. + */ + if (!cow) + btrfs_unlock_up_safe(p, level + 1); + ret = check_block(root, p, level); if (ret) { ret = -1; @@ -1453,6 +1606,7 @@ cow_done: } ret = bin_search(b, key, level, &slot); + if (level != 0) { if (ret && slot > 0) slot -= 1; @@ -1460,7 +1614,16 @@ cow_done: if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret = split_node(trans, root, p, level); + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1468,9 +1631,19 @@ cow_done: } b = p->nodes[level]; slot = p->slots[level]; - } else if (ins_len < 0) { - int sret = balance_level(trans, root, p, - level); + } else if (ins_len < 0 && + btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + if (sret) { ret = sret; goto done; @@ -1504,7 +1677,7 @@ cow_done: * of the btree by dropping locks before * we read. */ - if (level > 1) { + if (level > 0) { btrfs_release_path(NULL, p); if (tmp) free_extent_buffer(tmp); @@ -1519,6 +1692,7 @@ cow_done: free_extent_buffer(tmp); goto again; } else { + btrfs_set_path_blocking(p); if (tmp) free_extent_buffer(tmp); if (should_reada) @@ -1528,14 +1702,29 @@ cow_done: b = read_node_slot(root, b, slot); } } - if (!p->skip_locking) - btrfs_tree_lock(b); + if (!p->skip_locking) { + int lret; + + btrfs_clear_path_blocking(p, NULL); + lret = btrfs_try_spin_lock(b); + + if (!lret) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b); + } + } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret = split_leaf(trans, root, key, + int sret; + + btrfs_set_path_blocking(p); + sret = split_leaf(trans, root, key, p, ins_len, ret == 0); + btrfs_clear_path_blocking(p, NULL); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1549,12 +1738,16 @@ cow_done: } ret = 1; done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + btrfs_set_path_blocking(p); if (prealloc_block.objectid) { btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); } - return ret; } @@ -1578,6 +1771,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); BUG_ON(ret); + btrfs_set_lock_blocking(eb); + parent = eb; while (1) { level = btrfs_header_level(parent); @@ -1602,6 +1797,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } /* @@ -1626,6 +1822,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } ret = btrfs_cow_block(trans, root, eb, parent, slot, @@ -2172,6 +2369,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size) goto out_unlock; @@ -2367,6 +2566,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size) { ret = 1; @@ -2825,6 +3026,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, path->keep_locks = 0; BUG_ON(ret); + /* + * make sure any changes to the path from split_leaf leave it + * in a blocking state + */ + btrfs_set_path_blocking(path); + leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); @@ -3354,6 +3561,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, BUG(); } out: + btrfs_unlock_up_safe(path, 1); return ret; } @@ -3441,15 +3649,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, { int ret; u64 root_gen = btrfs_header_generation(path->nodes[1]); + u64 parent_start = path->nodes[1]->start; + u64 parent_owner = btrfs_header_owner(path->nodes[1]); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + ret = btrfs_free_extent(trans, root, bytenr, btrfs_level_size(root, 0), - path->nodes[1]->start, - btrfs_header_owner(path->nodes[1]), + parent_start, parent_owner, root_gen, 0, 1); return ret; } @@ -3721,6 +3936,7 @@ find_next_key: */ if (slot >= nritems) { path->slots[level] = slot; + btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, cache_only, min_trans); if (sret == 0) { @@ -3738,16 +3954,20 @@ find_next_key: unlock_up(path, level, 1); goto out; } + btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); btrfs_tree_lock(cur); + path->locks[level - 1] = 1; path->nodes[level - 1] = cur; unlock_up(path, level, 1); + btrfs_clear_path_blocking(path, NULL); } out: if (ret == 0) memcpy(min_key, &found_key, sizeof(found_key)); + btrfs_set_path_blocking(path); return ret; } @@ -3843,6 +4063,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (ret < 0) return ret; + btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -3873,6 +4094,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } + /* the path was set to blocking above */ if (level == 1 && (path->locks[1] || path->skip_locking) && path->reada) reada_for_search(root, path, level, slot, 0); @@ -3881,6 +4103,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(c)); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } break; } @@ -3897,12 +4120,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) path->locks[level] = 1; if (!level) break; + + btrfs_set_path_blocking(path); if (level == 1 && path->locks[1] && path->reada) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(path->nodes[level])); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } } done: @@ -3927,6 +4153,7 @@ int btrfs_previous_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eee060f8811..82491ba8fa4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -43,11 +43,7 @@ struct btrfs_ordered_sum; #define BTRFS_ACL_NOT_CACHED ((void *)-1) -#ifdef CONFIG_LOCKDEP -# define BTRFS_MAX_LEVEL 7 -#else -# define BTRFS_MAX_LEVEL 8 -#endif +#define BTRFS_MAX_LEVEL 8 /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -454,17 +450,11 @@ struct btrfs_timespec { __le32 nsec; } __attribute__ ((__packed__)); -typedef enum { +enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LAST = 2, -} btrfs_compression_type; - -/* we don't understand any encryption methods right now */ -typedef enum { - BTRFS_ENCRYPTION_NONE = 0, - BTRFS_ENCRYPTION_LAST = 1, -} btrfs_encryption_type; +}; struct btrfs_inode_item { /* nfs style generation number */ @@ -606,13 +596,27 @@ struct btrfs_block_group_item { struct btrfs_space_info { u64 flags; - u64 total_bytes; - u64 bytes_used; - u64 bytes_pinned; - u64 bytes_reserved; - u64 bytes_readonly; - int full; - int force_alloc; + + u64 total_bytes; /* total bytes in the space */ + u64 bytes_used; /* total bytes used on disk */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + /* delalloc accounting */ + u64 bytes_delalloc; /* number of bytes reserved for allocation, + this space is not necessarily reserved yet + by the allocator */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ + struct list_head list; /* for block groups in our same type */ @@ -701,9 +705,7 @@ struct btrfs_fs_info { struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; - wait_queue_head_t async_submit_wait; - wait_queue_head_t tree_log_wait; struct btrfs_super_block super_copy; struct btrfs_super_block super_for_commit; @@ -711,7 +713,6 @@ struct btrfs_fs_info { struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; - spinlock_t hash_lock; struct mutex trans_mutex; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; @@ -730,10 +731,6 @@ struct btrfs_fs_info { atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; - atomic_t tree_log_writers; - atomic_t tree_log_commit; - unsigned long tree_log_batch; - u64 tree_log_transid; /* * this is used by the balancing code to wait for all the pending @@ -833,7 +830,14 @@ struct btrfs_root { struct kobject root_kobj; struct completion kobj_unregister; struct mutex objectid_mutex; + struct mutex log_mutex; + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; + unsigned long log_batch; u64 objectid; u64 last_trans; @@ -1721,7 +1725,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr, u32 blocksize, + int level); int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 parent, u64 min_bytes, @@ -1791,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root); int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes); +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); /* ctree.c */ int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, @@ -1840,7 +1855,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); -void btrfs_init_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); + int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); int btrfs_del_leaf(struct btrfs_trans_handle *trans, @@ -2034,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 81a313874ae..adda739a021 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -16,7 +16,6 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/version.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/scatterlist.h> @@ -76,6 +75,40 @@ struct async_submit_bio { struct btrfs_work work; }; +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + /* * extents on the btree inode are pretty simple, there's one extent * that covers the entire device @@ -348,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -393,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); + btrfs_set_buffer_lockdep_class(eb, found_level); + ret = csum_tree_block(root, eb, 1); if (ret) ret = -EIO; @@ -800,7 +844,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret == 0) - buf->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); else WARN_ON(1); return buf; @@ -814,6 +858,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { WARN_ON(!btrfs_tree_locked(buf)); + + /* ugh, clear_extent_buffer_dirty can be expensive */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -850,6 +898,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->list_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + init_waitqueue_head(&root->log_writer_wait); + init_waitqueue_head(&root->log_commit_wait[0]); + init_waitqueue_head(&root->log_commit_wait[1]); + atomic_set(&root->log_commit[0], 0); + atomic_set(&root->log_commit[1], 0); + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -934,15 +990,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, return 0; } -int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct btrfs_root *tree_root = fs_info->tree_root; + struct extent_buffer *leaf; root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) - return -ENOMEM; + return ERR_PTR(-ENOMEM); __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, @@ -951,12 +1008,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ root->ref_cows = 0; - root->node = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + kfree(root); + return ERR_CAST(leaf); + } + root->node = leaf; btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); btrfs_set_header_bytenr(root->node, root->node->start); @@ -968,7 +1036,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); - fs_info->log_root_tree = root; + return root; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *log_root; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + WARN_ON(fs_info->log_root_tree); + fs_info->log_root_tree = log_root; + return 0; +} + +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *log_root; + struct btrfs_inode_item *inode_item; + + log_root = alloc_log_tree(trans, root->fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + log_root->last_trans = trans->transid; + log_root->root_key.offset = root->root_key.objectid; + + inode_item = &log_root->root_item.inode; + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); + btrfs_set_root_generation(&log_root->root_item, trans->transid); + + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; return 0; } @@ -1136,7 +1245,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; int ret = 0; - struct list_head *cur; struct btrfs_device *device; struct backing_dev_info *bdi; #if 0 @@ -1144,8 +1252,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) btrfs_congested_async(info, 0)) return 1; #endif - list_for_each(cur, &info->fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); @@ -1163,13 +1270,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) */ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { - struct list_head *cur; struct btrfs_device *device; struct btrfs_fs_info *info; info = (struct btrfs_fs_info *)bdi->unplug_io_data; - list_for_each(cur, &info->fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -1447,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - spin_lock_init(&fs_info->hash_lock); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); @@ -1535,10 +1639,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); - init_waitqueue_head(&fs_info->tree_log_wait); - atomic_set(&fs_info->tree_log_commit, 0); - atomic_set(&fs_info->tree_log_writers, 0); - fs_info->tree_log_transid = 0; __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -1627,6 +1727,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, * low idle thresh */ fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + fs_info->endio_write_workers.idle_thresh = 64; fs_info->endio_meta_write_workers.idle_thresh = 64; @@ -1720,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); dev_root->track_dirty = 1; - if (ret) goto fail_extent_root; @@ -1740,13 +1841,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); - if (!fs_info->cleaner_kthread) + if (IS_ERR(fs_info->cleaner_kthread)) goto fail_csum_root; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, "btrfs-transaction"); - if (!fs_info->transaction_kthread) + if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; if (btrfs_super_log_root(disk_super) != 0) { @@ -1828,13 +1929,14 @@ fail_sb_buffer: fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); -fail: + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + bdi_destroy(&fs_info->bdi); +fail: kfree(extent_root); kfree(tree_root); - bdi_destroy(&fs_info->bdi); kfree(fs_info); kfree(chunk_root); kfree(dev_root); @@ -1995,7 +2097,6 @@ static int write_dev_supers(struct btrfs_device *device, int write_all_supers(struct btrfs_root *root, int max_mirrors) { - struct list_head *cur; struct list_head *head = &root->fs_info->fs_devices->devices; struct btrfs_device *dev; struct btrfs_super_block *sb; @@ -2011,8 +2112,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) sb = &root->fs_info->super_for_commit; dev_item = &sb->dev_item; - list_for_each(cur, head) { - dev = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; @@ -2045,8 +2145,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) } total_errors = 0; - list_for_each(cur, head) { - dev = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; if (!dev->in_fs_metadata || !dev->writeable) @@ -2260,6 +2359,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; + btrfs_set_lock_blocking(buf); + WARN_ON(!btrfs_tree_locked(buf)); if (transid != root->fs_info->generation) { printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " @@ -2302,14 +2403,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int ret; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret == 0) - buf->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return ret; } int btree_lock_page_hook(struct page *page) { struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; unsigned long len; @@ -2324,9 +2424,7 @@ int btree_lock_page_hook(struct page *page) goto out; btrfs_tree_lock(eb); - spin_lock(&root->fs_info->hash_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - spin_unlock(&root->fs_info->hash_lock); btrfs_tree_unlock(eb); free_extent_buffer(eb); out: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c0ff404c31b..95029db227b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -98,5 +98,17 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btree_lock_page_hook(struct page *page); + + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 293da650873..6b5966aacf4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -19,7 +19,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> -#include <linux/version.h> +#include <linux/sort.h> #include "compat.h" #include "hash.h" #include "crc32c.h" @@ -30,7 +30,6 @@ #include "volumes.h" #include "locking.h" #include "ref-cache.h" -#include "compat.h" #define PENDING_EXTENT_INSERT 0 #define PENDING_EXTENT_DELETE 1 @@ -61,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -326,10 +329,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, u64 flags) { struct list_head *head = &info->space_info; - struct list_head *cur; struct btrfs_space_info *found; - list_for_each(cur, head) { - found = list_entry(cur, struct btrfs_space_info, list); + list_for_each_entry(found, head, list) { if (found->flags == flags) return found; } @@ -1326,8 +1327,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + u64 start; + u64 end; + int ret; + + while(1) { + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + + /* is there more work to do? */ + ret = find_first_extent_bit(&root->fs_info->pending_del, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + ret = find_first_extent_bit(&root->fs_info->extent_ins, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + break; + } return 0; } @@ -1525,15 +1543,55 @@ out: return ret; } -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, struct extent_buffer *buf, - u32 *nr_extents) +/* when a block goes through cow, we update the reference counts of + * everything that block points to. The internal pointers of the block + * can be in just about any order, and it is likely to have clusters of + * things that are close together and clusters of things that are not. + * + * To help reduce the seeks that come with updating all of these reference + * counts, sort them by byte number before actual updates are done. + * + * struct refsort is used to match byte number to slot in the btree block. + * we sort based on the byte number and then use the slot to actually + * find the item. + * + * struct refsort is smaller than strcut btrfs_item and smaller than + * struct btrfs_key_ptr. Since we're currently limited to the page size + * for a btree block, there's no way for a kmalloc of refsorts for a + * single node to be bigger than a page. + */ +struct refsort { + u64 bytenr; + u32 slot; +}; + +/* + * for passing into sort() + */ +static int refsort_cmp(const void *a_void, const void *b_void) +{ + const struct refsort *a = a_void; + const struct refsort *b = b_void; + + if (a->bytenr < b->bytenr) + return -1; + if (a->bytenr > b->bytenr) + return 1; + return 0; +} + + +noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *orig_buf, + struct extent_buffer *buf, u32 *nr_extents) { u64 bytenr; u64 ref_root; u64 orig_root; u64 ref_generation; u64 orig_generation; + struct refsort *sorted; u32 nritems; u32 nr_file_extents = 0; struct btrfs_key key; @@ -1542,6 +1600,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int level; int ret = 0; int faili = 0; + int refi = 0; + int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, u64, u64); @@ -1553,6 +1613,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); + sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); + BUG_ON(!sorted); + if (root->ref_cows) { process_func = __btrfs_inc_extent_ref; } else { @@ -1565,6 +1628,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, process_func = __btrfs_update_extent_ref; } + /* + * we make two passes through the items. In the first pass we + * only record the byte number and slot. Then we sort based on + * byte number and do the actual work based on the sorted results + */ for (i = 0; i < nritems; i++) { cond_resched(); if (level == 0) { @@ -1581,6 +1649,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, continue; nr_file_extents++; + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } else { + bytenr = btrfs_node_blockptr(buf, i); + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } + } + /* + * if refi == 0, we didn't actually put anything into the sorted + * array and we're done + */ + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + cond_resched(); + slot = sorted[i].slot; + bytenr = sorted[i].bytenr; + + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); ret = process_func(trans, root, bytenr, orig_buf->start, buf->start, @@ -1589,25 +1683,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, key.objectid); if (ret) { - faili = i; + faili = slot; WARN_ON(1); goto fail; } } else { - bytenr = btrfs_node_blockptr(buf, i); ret = process_func(trans, root, bytenr, orig_buf->start, buf->start, orig_root, ref_root, orig_generation, ref_generation, level - 1); if (ret) { - faili = i; + faili = slot; WARN_ON(1); goto fail; } } } out: + kfree(sorted); if (nr_extents) { if (level == 0) *nr_extents = nr_file_extents; @@ -1616,6 +1710,7 @@ out: } return 0; fail: + kfree(sorted); WARN_ON(1); return ret; } @@ -1818,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; + found->bytes_delalloc = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -1881,6 +1977,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return flags; } +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) +{ + struct btrfs_fs_info *info = root->fs_info; + u64 alloc_profile; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } + + return btrfs_reduce_alloc_profile(root, data); +} + +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +{ + u64 alloc_target; + + alloc_target = btrfs_get_alloc_profile(root, 1); + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + alloc_target); +} + +/* + * for now this just makes sure we have at least 5% of our metadata space free + * for use. + */ +int btrfs_check_metadata_free_space(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 alloc_target, thresh; + int committed = 0, ret; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + +again: + spin_lock(&meta_sinfo->lock); + if (!meta_sinfo->full) + thresh = meta_sinfo->total_bytes * 80; + else + thresh = meta_sinfo->total_bytes * 95; + + do_div(thresh, 100); + + if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { + struct btrfs_trans_handle *trans; + if (!meta_sinfo->full) { + meta_sinfo->force_alloc = 1; + spin_unlock(&meta_sinfo->lock); + + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, alloc_target, 0); + btrfs_end_transaction(trans, root); + goto again; + } + spin_unlock(&meta_sinfo->lock); + + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + return -ENOSPC; + } + spin_unlock(&meta_sinfo->lock); + + return 0; +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + int ret = 0, committed = 0; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - + data_sinfo->bytes_may_use < bytes) { + struct btrfs_trans_handle *trans; + + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, 0); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + goto again; + } + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + + printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" + ", %llu bytes_used, %llu bytes_reserved, " + "%llu bytes_pinned, %llu bytes_readonly, %llu may use" + "%llu total\n", bytes, data_sinfo->bytes_delalloc, + data_sinfo->bytes_used, data_sinfo->bytes_reserved, + data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, + data_sinfo->bytes_may_use, data_sinfo->total_bytes); + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + + return btrfs_check_metadata_free_space(root); +} + +/* + * if there was an error for whatever reason after calling + * btrfs_check_data_free_space, call this so we can cleanup the counters. + */ +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); +} + +/* called when we are adding a delalloc extent to the inode's io_tree */ +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* get the space info for where this inode will be storing its data */ + data_sinfo = BTRFS_I(inode)->space_info; + + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_delalloc += bytes; + + /* + * we are adding a delalloc extent without calling + * btrfs_check_data_free_space first. This happens on a weird + * writepage condition, but shouldn't hurt our accounting + */ + if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { + data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; + BTRFS_I(inode)->reserved_bytes = 0; + } else { + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + } + + spin_unlock(&data_sinfo->lock); +} + +/* called when we are clearing an delalloc extent from the inode's io_tree */ +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *info; + + info = BTRFS_I(inode)->space_info; + + spin_lock(&info->lock); + info->bytes_delalloc -= bytes; + spin_unlock(&info->lock); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) @@ -2137,13 +2460,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, u64 end; u64 priv; u64 search = 0; - u64 skipped = 0; struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_path *path; struct pending_extent_op *extent_op, *tmp; struct list_head insert_list, update_list; int ret; - int num_inserts = 0, max_inserts; + int num_inserts = 0, max_inserts, restart = 0; path = btrfs_alloc_path(); INIT_LIST_HEAD(&insert_list); @@ -2159,18 +2481,19 @@ again: ret = find_first_extent_bit(&info->extent_ins, search, &start, &end, EXTENT_WRITEBACK); if (ret) { - if (skipped && all && !num_inserts) { - skipped = 0; + if (restart && !num_inserts && + list_empty(&update_list)) { + restart = 0; search = 0; continue; } - mutex_unlock(&info->extent_ins_mutex); break; } ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); if (!ret) { - skipped = 1; + if (all) + restart = 1; search = end + 1; if (need_resched()) { mutex_unlock(&info->extent_ins_mutex); @@ -2189,7 +2512,7 @@ again: list_add_tail(&extent_op->list, &insert_list); search = end + 1; if (num_inserts == max_inserts) { - mutex_unlock(&info->extent_ins_mutex); + restart = 1; break; } } else if (extent_op->type == PENDING_BACKREF_UPDATE) { @@ -2205,7 +2528,6 @@ again: * somebody marked this thing for deletion then just unlock it and be * done, the free_extents will handle it */ - mutex_lock(&info->extent_ins_mutex); list_for_each_entry_safe(extent_op, tmp, &update_list, list) { clear_extent_bits(&info->extent_ins, extent_op->bytenr, extent_op->bytenr + extent_op->num_bytes - 1, @@ -2227,6 +2549,10 @@ again: if (!list_empty(&update_list)) { ret = update_backrefs(trans, extent_root, path, &update_list); BUG_ON(ret); + + /* we may have COW'ed new blocks, so lets start over */ + if (all) + restart = 1; } /* @@ -2234,9 +2560,9 @@ again: * need to make sure everything is cleaned then reset everything and * go back to the beginning */ - if (!num_inserts && all && skipped) { + if (!num_inserts && restart) { search = 0; - skipped = 0; + restart = 0; INIT_LIST_HEAD(&update_list); INIT_LIST_HEAD(&insert_list); goto again; @@ -2293,27 +2619,19 @@ again: BUG_ON(ret); /* - * if we broke out of the loop in order to insert stuff because we hit - * the maximum number of inserts at a time we can handle, then loop - * back and pick up where we left off - */ - if (num_inserts == max_inserts) { - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - num_inserts = 0; - goto again; - } - - /* - * again, if we need to make absolutely sure there are no more pending - * extent operations left and we know that we skipped some, go back to - * the beginning and do it all again + * if restart is set for whatever reason we need to go back and start + * searching through the pending list again. + * + * We just inserted some extents, which could have resulted in new + * blocks being allocated, which would result in new blocks needing + * updates, so if all is set we _must_ restart to get the updated + * blocks. */ - if (all && skipped) { + if (restart || all) { INIT_LIST_HEAD(&insert_list); INIT_LIST_HEAD(&update_list); search = 0; - skipped = 0; + restart = 0; num_inserts = 0; goto again; } @@ -2547,6 +2865,7 @@ again: if (ret) { if (all && skipped && !nr) { search = 0; + skipped = 0; continue; } mutex_unlock(&info->extent_ins_mutex); @@ -2633,6 +2952,8 @@ again: goto again; } + if (!err) + finish_current_insert(trans, extent_root, 0); return err; } @@ -2700,13 +3021,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* if metadata always pin */ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_block_group_cache *cache; - - /* btrfs_free_reserved_extent */ - cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); - btrfs_add_free_space(cache, bytenr, num_bytes); - put_block_group(cache); + mutex_lock(&root->fs_info->pinned_mutex); + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + mutex_unlock(&root->fs_info->pinned_mutex); update_reserved_extents(root, bytenr, num_bytes, 0); return 0; } @@ -2787,7 +3104,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (data & BTRFS_BLOCK_GROUP_METADATA) { last_ptr = &root->fs_info->last_alloc; - empty_cluster = 64 * 1024; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; } if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) @@ -3014,16 +3332,18 @@ loop_check: static void dump_space_info(struct btrfs_space_info *info, u64 bytes) { struct btrfs_block_group_cache *cache; - struct list_head *l; printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," + " may_use=%llu, used=%llu\n", info->total_bytes, + info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, + info->bytes_used); down_read(&info->groups_sem); - list_for_each(l, &info->block_groups) { - cache = list_entry(l, struct btrfs_block_group_cache, list); + list_for_each_entry(cache, &info->block_groups, list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used " "%llu pinned %llu reserved\n", @@ -3047,24 +3367,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - u64 alloc_profile; struct btrfs_fs_info *info = root->fs_info; - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } + data = btrfs_get_alloc_profile(root, data); again: - data = btrfs_reduce_alloc_profile(root, data); /* * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations @@ -3332,7 +3638,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr, u32 blocksize, + int level) { struct extent_buffer *buf; @@ -3340,9 +3647,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + + btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); @@ -3351,6 +3662,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, buf->start + buf->len - 1, GFP_NOFS); } trans->blocks_used++; + /* this returns a buffer locked for blocking */ return buf; } @@ -3379,7 +3691,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); return buf; } @@ -3388,36 +3701,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, { u64 leaf_owner; u64 leaf_generation; + struct refsort *sorted; struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int nritems; int ret; + int refi = 0; + int slot; BUG_ON(!btrfs_is_leaf(leaf)); nritems = btrfs_header_nritems(leaf); leaf_owner = btrfs_header_owner(leaf); leaf_generation = btrfs_header_generation(leaf); + sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + /* we do this loop twice. The first time we build a list + * of the extents we have a reference on, then we sort the list + * by bytenr. The second time around we actually do the + * extent freeing. + */ for (i = 0; i < nritems; i++) { u64 disk_bytenr; cond_resched(); btrfs_item_key_to_cpu(leaf, &key, i); + + /* only extents have references, skip everything else */ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + + /* inline extents live in the btree, they don't have refs */ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - /* - * FIXME make sure to insert a trans record that - * repeats the snapshot del on crash - */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + + /* holes don't have refs */ if (disk_bytenr == 0) continue; + sorted[refi].bytenr = disk_bytenr; + sorted[refi].slot = i; + refi++; + } + + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + u64 disk_bytenr; + + disk_bytenr = sorted[i].bytenr; + slot = sorted[i].slot; + + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + ret = __btrfs_free_extent(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(leaf, fi), leaf->start, leaf_owner, leaf_generation, @@ -3428,6 +3778,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, wake_up(&root->fs_info->transaction_throttle); cond_resched(); } +out: + kfree(sorted); return 0; } @@ -3437,9 +3789,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, { int i; int ret; - struct btrfs_extent_info *info = ref->extents; + struct btrfs_extent_info *info; + struct refsort *sorted; + + if (ref->nritems == 0) + return 0; + sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); for (i = 0; i < ref->nritems; i++) { + sorted[i].bytenr = ref->extents[i].bytenr; + sorted[i].slot = i; + } + sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); + + /* + * the items in the ref were sorted when the ref was inserted + * into the ref cache, so this is already in order + */ + for (i = 0; i < ref->nritems; i++) { + info = ref->extents + sorted[i].slot; ret = __btrfs_free_extent(trans, root, info->bytenr, info->num_bytes, ref->bytenr, ref->owner, ref->generation, @@ -3453,6 +3821,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, info++; } + kfree(sorted); return 0; } @@ -3497,6 +3866,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, } /* + * this is used while deleting old snapshots, and it drops the refs + * on a whole subtree starting from a level 1 node. + * + * The idea is to sort all the leaf pointers, and then drop the + * ref on all the leaves in order. Most of the time the leaves + * will have ref cache entries, so no leaf IOs will be required to + * find the extents they have references on. + * + * For each leaf, any references it has are also dropped in order + * + * This ends up dropping the references in something close to optimal + * order for reading and modifying the extent allocation tree. + */ +static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + u64 bytenr; + u64 root_owner; + u64 root_gen; + struct extent_buffer *eb = path->nodes[1]; + struct extent_buffer *leaf; + struct btrfs_leaf_ref *ref; + struct refsort *sorted = NULL; + int nritems = btrfs_header_nritems(eb); + int ret; + int i; + int refi = 0; + int slot = path->slots[1]; + u32 blocksize = btrfs_level_size(root, 0); + u32 refs; + + if (nritems == 0) + goto out; + + root_owner = btrfs_header_owner(eb); + root_gen = btrfs_header_generation(eb); + sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + + /* + * step one, sort all the leaf pointers so we don't scribble + * randomly into the extent allocation tree + */ + for (i = slot; i < nritems; i++) { + sorted[refi].bytenr = btrfs_node_blockptr(eb, i); + sorted[refi].slot = i; + refi++; + } + + /* + * nritems won't be zero, but if we're picking up drop_snapshot + * after a crash, slot might be > 0, so double check things + * just in case. + */ + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + /* + * the first loop frees everything the leaves point to + */ + for (i = 0; i < refi; i++) { + u64 ptr_gen; + + bytenr = sorted[i].bytenr; + + /* + * check the reference count on this leaf. If it is > 1 + * we just decrement it below and don't update any + * of the refs the leaf points to. + */ + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + BUG_ON(ret); + if (refs != 1) + continue; + + ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); + + /* + * the leaf only had one reference, which means the + * only thing pointing to this leaf is the snapshot + * we're deleting. It isn't possible for the reference + * count to increase again later + * + * The reference cache is checked for the leaf, + * and if found we'll be able to drop any refs held by + * the leaf without needing to read it in. + */ + ref = btrfs_lookup_leaf_ref(root, bytenr); + if (ref && ref->generation != ptr_gen) { + btrfs_free_leaf_ref(root, ref); + ref = NULL; + } + if (ref) { + ret = cache_drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(root, ref); + } else { + /* + * the leaf wasn't in the reference cache, so + * we have to read it. + */ + leaf = read_tree_block(root, bytenr, blocksize, + ptr_gen); + ret = btrfs_drop_leaf_ref(trans, root, leaf); + BUG_ON(ret); + free_extent_buffer(leaf); + } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } + + /* + * run through the loop again to free the refs on the leaves. + * This is faster than doing it in the loop above because + * the leaves are likely to be clustered together. We end up + * working in nice chunks on the extent allocation tree. + */ + for (i = 0; i < refi; i++) { + bytenr = sorted[i].bytenr; + ret = __btrfs_free_extent(trans, root, bytenr, + blocksize, eb->start, + root_owner, root_gen, 0, 1); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } +out: + kfree(sorted); + + /* + * update the path to show we've processed the entire level 1 + * node. This will get saved into the root's drop_snapshot_progress + * field so these drops are not repeated again if this transaction + * commits. + */ + path->slots[1] = nritems; + return 0; +} + +/* * helper function for drop_snapshot, this walks down the tree dropping ref * counts as it goes. */ @@ -3511,7 +4026,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct extent_buffer *next; struct extent_buffer *cur; struct extent_buffer *parent; - struct btrfs_leaf_ref *ref; u32 blocksize; int ret; u32 refs; @@ -3538,17 +4052,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, if (path->slots[*level] >= btrfs_header_nritems(cur)) break; + + /* the new code goes down to level 1 and does all the + * leaves pointed to that node in bulk. So, this check + * for level 0 will always be false. + * + * But, the disk format allows the drop_snapshot_progress + * field in the root to leave things in a state where + * a leaf will need cleaning up here. If someone crashes + * with the old code and then boots with the new code, + * we might find a leaf here. + */ if (*level == 0) { ret = btrfs_drop_leaf_ref(trans, root, cur); BUG_ON(ret); break; } + + /* + * once we get to level one, process the whole node + * at once, including everything below it. + */ + if (*level == 1) { + ret = drop_level_one_refs(trans, root, path); + BUG_ON(ret); + break; + } + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); BUG_ON(ret); + + /* + * if there is more than one reference, we don't need + * to read that node to drop any references it has. We + * just drop the ref we hold on that node and move on to the + * next slot in this level. + */ if (refs != 1) { parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); @@ -3567,46 +4110,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, continue; } + /* - * at this point, we have a single ref, and since the - * only place referencing this extent is a dead root - * the reference count should never go higher. - * So, we don't need to check it again + * we need to keep freeing things in the next level down. + * read the block and loop around to process it */ - if (*level == 1) { - ref = btrfs_lookup_leaf_ref(root, bytenr); - if (ref && ref->generation != ptr_gen) { - btrfs_free_leaf_ref(root, ref); - ref = NULL; - } - if (ref) { - ret = cache_drop_leaf_ref(trans, root, ref); - BUG_ON(ret); - btrfs_remove_leaf_ref(root, ref); - btrfs_free_leaf_ref(root, ref); - *level = 0; - break; - } - } - next = btrfs_find_tree_block(root, bytenr, blocksize); - if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { - free_extent_buffer(next); - - next = read_tree_block(root, bytenr, blocksize, - ptr_gen); - cond_resched(); -#if 0 - /* - * this is a debugging check and can go away - * the ref should never go all the way down to 1 - * at this point - */ - ret = lookup_extent_ref(NULL, root, bytenr, blocksize, - &refs); - BUG_ON(ret); - WARN_ON(refs != 1); -#endif - } + next = read_tree_block(root, bytenr, blocksize, ptr_gen); WARN_ON(*level <= 0); if (path->nodes[*level-1]) free_extent_buffer(path->nodes[*level-1]); @@ -3631,11 +4140,16 @@ out: root_owner = btrfs_header_owner(parent); root_gen = btrfs_header_generation(parent); + /* + * cleanup and free the reference on the last node + * we processed + */ ret = __btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level, 1); free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; + *level += 1; BUG_ON(ret); @@ -3687,6 +4201,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, next = read_tree_block(root, bytenr, blocksize, ptr_gen); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, &refs); @@ -3754,6 +4269,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { struct extent_buffer *node; struct btrfs_disk_key disk_key; + + /* + * there is more work to do in this level. + * Update the drop_progress marker to reflect + * the work we've done so far, and then bump + * the slot number + */ node = path->nodes[i]; path->slots[i]++; *level = i; @@ -3765,6 +4287,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, return 0; } else { struct extent_buffer *parent; + + /* + * this whole node is done, free our reference + * on it and go up one level + */ if (path->nodes[*level] == root->node) parent = path->nodes[*level]; else @@ -4444,7 +4971,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans, u64 lock_end = 0; u64 num_bytes; u64 ext_offset; - u64 first_pos; + u64 search_end = (u64)-1; u32 nritems; int nr_scaned = 0; int extent_locked = 0; @@ -4452,7 +4979,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans, int ret; memcpy(&key, leaf_key, sizeof(key)); - first_pos = INT_LIMIT(loff_t) - extent_key->offset; if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { if (key.objectid < ref_path->owner_objectid || (key.objectid == ref_path->owner_objectid && @@ -4501,7 +5027,7 @@ next: if ((key.objectid > ref_path->owner_objectid) || (key.objectid == ref_path->owner_objectid && key.type > BTRFS_EXTENT_DATA_KEY) || - (key.offset >= first_pos + extent_key->offset)) + key.offset >= search_end) break; } @@ -4534,8 +5060,10 @@ next: num_bytes = btrfs_file_extent_num_bytes(leaf, fi); ext_offset = btrfs_file_extent_offset(leaf, fi); - if (first_pos > key.offset - ext_offset) - first_pos = key.offset - ext_offset; + if (search_end == (u64)-1) { + search_end = key.offset - ext_offset + + btrfs_file_extent_ram_bytes(leaf, fi); + } if (!extent_locked) { lock_start = key.offset; @@ -4724,7 +5252,7 @@ next: } skip: if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && - key.offset >= first_pos + extent_key->offset) + key.offset >= search_end) break; cond_resched(); @@ -4778,6 +5306,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, ref->bytenr = buf->start; ref->owner = btrfs_header_owner(buf); ref->generation = btrfs_header_generation(buf); + ret = btrfs_add_leaf_ref(root, ref, 0); WARN_ON(ret); btrfs_free_leaf_ref(root, ref); @@ -5351,7 +5880,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, prev_block = block_start; } + mutex_lock(&extent_root->fs_info->trans_mutex); btrfs_record_root_in_trans(found_root); + mutex_unlock(&extent_root->fs_info->trans_mutex); if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { /* * try to update data extent references while @@ -5957,9 +6488,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - btrfs_remove_free_space_cache(block_group); + spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e086d407f1f..ebe6b29e606 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -9,7 +9,6 @@ #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> -#include <linux/version.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include "extent_io.h" @@ -31,7 +30,7 @@ static LIST_HEAD(buffers); static LIST_HEAD(states); #define LEAK_DEBUG 0 -#ifdef LEAK_DEBUG +#if LEAK_DEBUG static DEFINE_SPINLOCK(leak_lock); #endif @@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif @@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->private = 0; state->tree = NULL; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_add(&state->leak_list, &states); spin_unlock_irqrestore(&leak_lock, flags); @@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif WARN_ON(state->tree); -#ifdef LEAK_DEBUG +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); @@ -416,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); free_extent_state(prealloc); return -EEXIST; } @@ -2378,11 +2375,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, int scanned = 0; int range_whole = 0; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ @@ -2855,6 +2847,98 @@ out: return sector; } +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent) +{ + int ret; + u64 off = start; + u64 max = start + len; + u32 flags = 0; + u64 disko = 0; + struct extent_map *em = NULL; + int end = 0; + u64 em_start = 0, em_len = 0; + unsigned long emflags; + ret = 0; + + if (len == 0) + return -EINVAL; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + len, + GFP_NOFS); + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + while (!end) { + off = em->start + em->len; + if (off >= max) + end = 1; + + em_start = em->start; + em_len = em->len; + + disko = 0; + flags = 0; + + switch (em->block_start) { + case EXTENT_MAP_LAST_BYTE: + end = 1; + flags |= FIEMAP_EXTENT_LAST; + break; + case EXTENT_MAP_HOLE: + flags |= FIEMAP_EXTENT_UNWRITTEN; + break; + case EXTENT_MAP_INLINE: + flags |= (FIEMAP_EXTENT_DATA_INLINE | + FIEMAP_EXTENT_NOT_ALIGNED); + break; + case EXTENT_MAP_DELALLOC: + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + break; + default: + disko = em->block_start; + break; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + flags |= FIEMAP_EXTENT_ENCODED; + + emflags = em->flags; + free_extent_map(em); + em = NULL; + + if (!end) { + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + emflags = em->flags; + } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } +out_free: + free_extent_map(em); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, + GFP_NOFS); + return ret; +} + static inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { @@ -2892,15 +2976,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, gfp_t mask) { struct extent_buffer *eb = NULL; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); eb->start = start; eb->len = len; - mutex_init(&eb->mutex); -#ifdef LEAK_DEBUG + spin_lock_init(&eb->lock); + init_waitqueue_head(&eb->lock_wq); + +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); @@ -2912,7 +2998,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, static void __free_extent_buffer(struct extent_buffer *eb) { -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; spin_lock_irqsave(&leak_lock, flags); list_del(&eb->leak_list); @@ -2980,8 +3066,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, unlock_page(p); } if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); spin_lock(&tree->buffer_lock); exists = buffer_tree_insert(tree, start, &eb->rb_node); @@ -3135,7 +3220,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, unsigned long num_pages; num_pages = num_extent_pages(eb->start, eb->len); - eb->flags &= ~EXTENT_UPTODATE; + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); @@ -3206,7 +3291,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, struct page *page; int pg_uptodate = 1; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3242,7 +3327,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned long bio_flags = 0; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3273,7 +3358,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (all_uptodate) { if (start_i == 0) - eb->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); goto unlock_exit; } @@ -3309,7 +3394,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (!ret) - eb->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -3406,7 +3491,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, unmap_extent_buffer(eb, eb->map_token, km); eb->map_token = NULL; save = 1; - WARN_ON(!mutex_is_locked(&eb->mutex)); } err = map_private_extent_buffer(eb, start, min_len, token, map, map_start, map_len, km); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c5b483a7913..1f9df88afbf 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -22,6 +22,10 @@ /* flags for bio submission */ #define EXTENT_BIO_COMPRESSED 1 +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_BLOCKING 1 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -95,11 +99,19 @@ struct extent_buffer { unsigned long map_start; unsigned long map_len; struct page *first_page; + unsigned long bflags; atomic_t refs; - int flags; struct list_head leak_list; struct rb_node rb_node; - struct mutex mutex; + + /* the spinlock is used to protect most operations */ + spinlock_t lock; + + /* + * when we keep the lock held while blocking, waiters go onto + * the wq + */ + wait_queue_head_t lock_wq; }; struct extent_map_tree; @@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree, unsigned from, unsigned to); sector_t extent_bmap(struct address_space *mapping, sector_t iblock, get_extent_t *get_extent); +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent); int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a83e33ada3..50da69da20c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,7 +3,6 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/spinlock.h> -#include <linux/version.h> #include <linux/hardirq.h> #include "extent_map.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 90268334145..dc78954861b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -29,7 +29,6 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> -#include <linux/version.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1092,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_free_space(root, write_bytes, 0); + ret = btrfs_check_data_free_space(root, inode, write_bytes); if (ret) goto out; ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, write_bytes); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } ret = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, buf); if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); btrfs_drop_pages(pages, num_pages); goto out; } @@ -1112,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ret = dirty_and_release_pages(NULL, root, file, pages, num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, @@ -1137,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } out: mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; out_nolock: kfree(pages); @@ -1215,15 +1224,15 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->fs_info->tree_log_batch++; + root->log_batch++; filemap_fdatawrite(inode->i_mapping); btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->fs_info->tree_log_batch++; + root->log_batch++; /* * ok we haven't committed the transaction yet, lets do a commit */ - if (file->private_data) + if (file && file->private_data) btrfs_ioctl_trans_end(file); trans = btrfs_start_transaction(root, 1); @@ -1232,7 +1241,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) goto out; @@ -1246,7 +1255,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); if (ret > 0) { ret = btrfs_commit_transaction(trans, root); @@ -1254,7 +1263,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_sync_log(trans, root); ret = btrfs_end_transaction(trans, root); } - mutex_lock(&file->f_dentry->d_inode->i_mutex); + mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? EIO : ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2aa79873eb4..cc7334d833c 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, search_key.type = 0; search_key.offset = 0; - btrfs_init_path(path); start_found = 0; ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); if (ret < 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8adfe059ab4..7d4f948bc22 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -34,7 +34,6 @@ #include <linux/statfs.h> #include <linux/compat.h> #include <linux/bit_spinlock.h> -#include <linux/version.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> @@ -51,6 +50,7 @@ #include "tree-log.h" #include "ref-cache.h" #include "compression.h" +#include "locking.h" struct btrfs_iget_args { u64 ino; @@ -91,32 +91,14 @@ static noinline int cow_file_range(struct inode *inode, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -/* - * a very lame attempt at stopping writes when the FS is 85% full. There - * are countless ways this is incorrect, but it is better than nothing. - */ -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del) +static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) { - u64 total; - u64 used; - u64 thresh; - int ret = 0; - - spin_lock(&root->fs_info->delalloc_lock); - total = btrfs_super_total_bytes(&root->fs_info->super_copy); - used = btrfs_super_bytes_used(&root->fs_info->super_copy); - if (for_del) - thresh = total * 90; - else - thresh = total * 85; - - do_div(thresh, 100); + int err; - if (used + root->fs_info->delalloc_bytes + num_required > thresh) - ret = -ENOSPC; - spin_unlock(&root->fs_info->delalloc_lock); - return ret; + err = btrfs_init_acl(inode, dir); + if (!err) + err = btrfs_xattr_security_init(inode, dir); + return err; } /* @@ -350,6 +332,19 @@ again: nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. + */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + total_compressed = actual_end - start; /* we want to make sure that amount of ram required to uncompress @@ -494,6 +489,7 @@ again: goto again; } } else { +cleanup_and_bail_uncompressed: /* * No compression, but we still need to write the pages in * the file we've been given so far. redirty the locked @@ -1166,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; root->fs_info->delalloc_bytes += end - start + 1; @@ -1199,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, (unsigned long long)end - start + 1, (unsigned long long) root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); root->fs_info->delalloc_bytes = 0; BTRFS_I(inode)->delalloc_bytes = 0; } else { + btrfs_delalloc_free_space(root, inode, + end - start + 1); root->fs_info->delalloc_bytes -= end - start + 1; BTRFS_I(inode)->delalloc_bytes -= end - start + 1; } @@ -1324,12 +1324,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_offset, struct list_head *list) { - struct list_head *cur; struct btrfs_ordered_sum *sum; btrfs_set_trans_block_group(trans, inode); - list_for_each(cur, list) { - sum = list_entry(cur, struct btrfs_ordered_sum, list); + + list_for_each_entry(sum, list, list) { btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); } @@ -2013,6 +2012,7 @@ void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2039,6 +2039,7 @@ void btrfs_read_locked_inode(struct inode *inode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: + inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } @@ -2108,6 +2109,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, goto failed; } + btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -2219,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2235,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2258,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2278,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) fail_trans: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); if (ret && !err) @@ -2429,6 +2421,8 @@ next_node: ref->generation = leaf_gen; ref->nritems = 0; + btrfs_sort_leaf_ref(ref); + ret = btrfs_add_leaf_ref(root, ref, 0); WARN_ON(ret); btrfs_free_leaf_ref(root, ref); @@ -2476,7 +2470,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - u32 found_type; + u32 found_type = (u8)-1; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; u64 extent_start = 0; @@ -2503,8 +2497,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.offset = (u64)-1; key.type = (u8)-1; - btrfs_init_path(path); - search_again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -2663,6 +2655,8 @@ next: if (pending_del_nr) goto del_pending; btrfs_release_path(root, path); + if (found_type == BTRFS_INODE_ITEM_KEY) + break; goto search_again; } @@ -2679,6 +2673,8 @@ del_pending: BUG_ON(ret); pending_del_nr = 0; btrfs_release_path(root, path); + if (found_type == BTRFS_INODE_ITEM_KEY) + break; goto search_again; } } @@ -2788,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) return err; @@ -2984,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->last_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; @@ -3005,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) inode->i_ino = args->ino; init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; + btrfs_set_inode_space_info(args->root, inode); return 0; } @@ -3265,7 +3263,7 @@ skip: /* Reached end of directory/root. Bump pos past the last item. */ if (key_type == BTRFS_DIR_INDEX_KEY) - filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); + filp->f_pos = INT_LIMIT(off_t); else filp->f_pos++; nopos: @@ -3425,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) owner = 0; @@ -3458,7 +3457,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, root->highest_inode = objectid; inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + + if (dir && (dir->i_mode & S_ISGID)) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; inode->i_ino = objectid; inode_set_bytes(inode, 0); @@ -3565,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; @@ -3586,7 +3592,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -3628,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; trans = btrfs_start_transaction(root, 1); @@ -3649,7 +3655,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -3696,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -ENOENT; btrfs_inc_nlink(inode); - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; err = btrfs_set_inode_index(dir, &index); @@ -3742,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_unlock; @@ -3772,7 +3778,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) goto out_fail; @@ -4158,9 +4164,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, return -EINVAL; } -static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) { - return extent_bmap(mapping, iblock, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); } int btrfs_readpage(struct file *file, struct page *page) @@ -4223,7 +4230,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - return __btrfs_releasepage(page, gfp_flags); + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } static void btrfs_invalidatepage(struct page *page, unsigned long offset) @@ -4298,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) u64 page_start; u64 page_end; - ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -4311,6 +4318,7 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -4593,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -EXDEV; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto out_unlock; @@ -4711,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_fail; @@ -4733,7 +4741,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4987,13 +4995,24 @@ static struct extent_io_ops btrfs_extent_io_ops = { .clear_bit_hook = btrfs_clear_bit_hook, }; +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. + */ static struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readpages = btrfs_readpages, .sync_page = block_sync_page, - .bmap = btrfs_bmap, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, @@ -5017,6 +5036,7 @@ static struct inode_operations btrfs_file_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .fallocate = btrfs_fallocate, + .fiemap = btrfs_fiemap, }; static struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -5032,4 +5052,8 @@ static struct inode_operations btrfs_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c2aa33e3feb..bca729fc80c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -38,7 +38,6 @@ #include <linux/compat.h> #include <linux/bit_spinlock.h> #include <linux/security.h> -#include <linux/version.h> #include <linux/xattr.h> #include <linux/vmalloc.h> #include "compat.h" @@ -71,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_commit; @@ -204,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_unlock; @@ -375,7 +374,7 @@ static int btrfs_defrag_file(struct file *file) unsigned long i; int ret; - ret = btrfs_check_free_space(root, inode->i_size, 0); + ret = btrfs_check_data_free_space(root, inode, inode->i_size); if (ret) return -ENOSPC; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 78049ea208d..b320b103fa1 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,13 +22,20 @@ #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 -#define BTRFS_PATH_NAME_MAX 3072 +#define BTRFS_PATH_NAME_MAX 4087 +/* this should be 4k */ struct btrfs_ioctl_vol_args { __s64 fd; char name[BTRFS_PATH_NAME_MAX + 1]; }; +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -52,11 +59,6 @@ struct btrfs_ioctl_vol_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ struct btrfs_ioctl_vol_args) -struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; -}; #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ struct btrfs_ioctl_clone_range_args) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 39bae7761db..85506c4a3af 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,64 +25,203 @@ #include "extent_io.h" #include "locking.h" +static inline void spin_nested(struct extent_buffer *eb) +{ + spin_lock(&eb->lock); +} + /* - * locks the per buffer mutex in an extent buffer. This uses adaptive locks - * and the spin is not tuned very extensively. The spinning does make a big - * difference in almost every workload, but spinning for the right amount of - * time needs some help. - * - * In general, we want to spin as long as the lock holder is doing btree - * searches, and we should give up if they are in more expensive code. + * Setting a lock to blocking will drop the spinlock and set the + * flag that forces other procs who want the lock to wait. After + * this you can safely schedule with the lock held. */ +void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + spin_unlock(&eb->lock); + } + /* exit with the spin lock released and the bit set */ +} -int btrfs_tree_lock(struct extent_buffer *eb) +/* + * clearing the blocking flag will take the spinlock again. + * After this you can't safely schedule + */ +void btrfs_clear_lock_blocking(struct extent_buffer *eb) { - int i; + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + spin_nested(eb); + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + smp_mb__after_clear_bit(); + } + /* exit with the spin lock held */ +} - if (mutex_trylock(&eb->mutex)) - return 0; +/* + * unfortunately, many of the places that currently set a lock to blocking + * don't end up blocking for every long, and often they don't block + * at all. For a dbench 50 run, if we don't spin one the blocking bit + * at all, the context switch rate can jump up to 400,000/sec or more. + * + * So, we're still stuck with this crummy spin on the blocking bit, + * at least until the most common causes of the short blocks + * can be dealt with. + */ +static int btrfs_spin_on_block(struct extent_buffer *eb) +{ + int i; for (i = 0; i < 512; i++) { cpu_relax(); - if (mutex_trylock(&eb->mutex)) - return 0; + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + if (need_resched()) + break; } - cpu_relax(); - mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); return 0; } -int btrfs_try_tree_lock(struct extent_buffer *eb) +/* + * This is somewhat different from trylock. It will take the + * spinlock but if it finds the lock is set to blocking, it will + * return without the lock held. + * + * returns 1 if it was able to take the lock and zero otherwise + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_spin_lock(struct extent_buffer *eb) { - return mutex_trylock(&eb->mutex); + int i; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + + /* spin for a bit on the BLOCKING flag */ + for (i = 0; i < 2; i++) { + if (!btrfs_spin_on_block(eb)) + break; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + return 0; } -int btrfs_tree_unlock(struct extent_buffer *eb) +/* + * the autoremove wake function will return 0 if it tried to wake up + * a process that was already awake, which means that process won't + * count as an exclusive wakeup. The waitq code will continue waking + * procs until it finds one that was actually sleeping. + * + * For btrfs, this isn't quite what we want. We want a single proc + * to be notified that the lock is ready for taking. If that proc + * already happen to be awake, great, it will loop around and try for + * the lock. + * + * So, btrfs_wake_function always returns 1, even when the proc that we + * tried to wake up was already awake. + */ +static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, + int sync, void *key) { - mutex_unlock(&eb->mutex); - return 0; + autoremove_wake_function(wait, mode, sync, key); + return 1; } -int btrfs_tree_locked(struct extent_buffer *eb) +/* + * returns with the extent buffer spinlocked. + * + * This will spin and/or wait as required to take the lock, and then + * return with the spinlock held. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_tree_lock(struct extent_buffer *eb) { - return mutex_is_locked(&eb->mutex); + DEFINE_WAIT(wait); + wait.func = btrfs_wake_function; + + while(1) { + spin_nested(eb); + + /* nobody is blocking, exit with the spinlock held */ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 0; + + /* + * we have the spinlock, but the real owner is blocking. + * wait for them + */ + spin_unlock(&eb->lock); + + /* + * spin for a bit, and if the blocking flag goes away, + * loop around + */ + if (btrfs_spin_on_block(eb)) + continue; + + prepare_to_wait_exclusive(&eb->lock_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + schedule(); + + finish_wait(&eb->lock_wq, &wait); + } + return 0; } /* - * btrfs_search_slot uses this to decide if it should drop its locks - * before doing something expensive like allocating free blocks for cow. + * Very quick trylock, this does not spin or schedule. It returns + * 1 with the spinlock held if it was able to take the lock, or it + * returns zero if it was unable to take the lock. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() */ -int btrfs_path_lock_waiting(struct btrfs_path *path, int level) +int btrfs_try_tree_lock(struct extent_buffer *eb) { - int i; - struct extent_buffer *eb; - for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { - eb = path->nodes[i]; - if (!eb) - break; - smp_mb(); - if (!list_empty(&eb->mutex.wait_list)) - return 1; + if (spin_trylock(&eb->lock)) { + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + /* + * we've got the spinlock, but the real owner is + * blocking. Drop the spinlock and return failure + */ + spin_unlock(&eb->lock); + return 0; + } + return 1; } + /* someone else has the spinlock giveup */ return 0; } +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + /* + * if we were a blocking owner, we don't have the spinlock held + * just clear the bit and look for waiters + */ + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + smp_mb__after_clear_bit(); + else + spin_unlock(&eb->lock); + + if (waitqueue_active(&eb->lock_wq)) + wake_up(&eb->lock_wq); + return 0; +} + +int btrfs_tree_locked(struct extent_buffer *eb) +{ + return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || + spin_is_locked(&eb->lock); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index bc1faef1251..6bb0afbff92 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -22,6 +22,10 @@ int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_tree_locked(struct extent_buffer *eb); + int btrfs_try_tree_lock(struct extent_buffer *eb); -int btrfs_path_lock_waiting(struct btrfs_path *path, int level); +int btrfs_try_spin_lock(struct extent_buffer *eb); + +void btrfs_set_lock_blocking(struct extent_buffer *eb); +void btrfs_clear_lock_blocking(struct extent_buffer *eb); #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a2094017027..77c2411a5f0 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, struct btrfs_sector_sum *sector_sums; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct list_head *cur; unsigned long num_sectors; unsigned long i; u32 sectorsize = BTRFS_I(inode)->root->sectorsize; @@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, return 1; mutex_lock(&tree->mutex); - list_for_each_prev(cur, &ordered->list) { - ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr) { num_sectors = ordered_sum->len / sectorsize; sector_sums = ordered_sum->sums; diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c index 6f0acc4c9ea..d0cc62bccb9 100644 --- a/fs/btrfs/ref-cache.c +++ b/fs/btrfs/ref-cache.c @@ -17,6 +17,7 @@ */ #include <linux/sched.h> +#include <linux/sort.h> #include "ctree.h" #include "ref-cache.h" #include "transaction.h" diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h index 16f3183d7c5..bc283ad2db7 100644 --- a/fs/btrfs/ref-cache.h +++ b/fs/btrfs/ref-cache.h @@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, int shared); int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); - #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7256cf242eb..19a4daf03cc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -37,7 +37,6 @@ #include <linux/ctype.h> #include <linux/namei.h> #include <linux/miscdevice.h> -#include <linux/version.h> #include <linux/magic.h> #include "compat.h" #include "ctree.h" @@ -380,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root); btrfs_wait_ordered_extents(root, 0); - btrfs_clean_old_snapshots(root); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); sb->s_dirt = 0; @@ -512,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) struct btrfs_root *root = btrfs_sb(sb); int ret; + ret = btrfs_parse_options(root, data); + if (ret) + return -EINVAL; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; @@ -582,18 +584,20 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, { struct btrfs_ioctl_vol_args *vol; struct btrfs_fs_devices *fs_devices; - int ret = 0; - int len; + int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) return -EPERM; vol = kmalloc(sizeof(*vol), GFP_KERNEL); + if (!vol) + return -ENOMEM; + if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { ret = -EFAULT; goto out; } - len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); + switch (cmd) { case BTRFS_IOC_SCAN_DEV: ret = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8a08f944334..4112d53d4f4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, num_bytes -= btrfs_root_used(&dirty->root->root_item); bytes_used = btrfs_root_used(&root->root_item); if (num_bytes) { + mutex_lock(&root->fs_info->trans_mutex); btrfs_record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); btrfs_set_root_used(&root->root_item, bytes_used - num_bytes); } @@ -852,11 +854,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, { struct btrfs_pending_snapshot *pending; struct list_head *head = &trans->transaction->pending_snapshots; - struct list_head *cur; int ret; - list_for_each(cur, head) { - pending = list_entry(cur, struct btrfs_pending_snapshot, list); + list_for_each_entry(pending, head, list) { ret = create_pending_snapshot(trans, fs_info, pending); BUG_ON(ret); } diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3e8358c3616..98d25fa4570 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d81cda2e077..9c462fbd60f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans, */ /* - * btrfs_add_log_tree adds a new per-subvolume log tree into the - * tree of log tree roots. This must be called with a tree log transaction - * running (see start_log_trans). - */ -static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_key key; - struct btrfs_root_item root_item; - struct btrfs_inode_item *inode_item; - struct extent_buffer *leaf; - struct btrfs_root *new_root = root; - int ret; - u64 objectid = root->root_key.objectid; - - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); - if (IS_ERR(leaf)) { - ret = PTR_ERR(leaf); - return ret; - } - - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); - - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), - BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(leaf); - - inode_item = &root_item.inode; - memset(inode_item, 0, sizeof(*inode_item)); - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 0); - btrfs_set_root_used(&root_item, 0); - - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); - root_item.drop_level = 0; - - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - leaf = NULL; - - btrfs_set_root_dirid(&root_item, 0); - - key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, - &root_item); - if (ret) - goto fail; - - new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, - &key); - BUG_ON(!new_root); - - WARN_ON(root->log_root); - root->log_root = new_root; - - /* - * log trees do not get reference counted because they go away - * before a real commit is actually done. They do store pointers - * to file data extents, and those reference counts still get - * updated (along with back refs to the log tree). - */ - new_root->ref_cows = 0; - new_root->last_trans = trans->transid; - - /* - * we need to make sure the root block for this new tree - * is marked as dirty in the dirty_log_pages tree. This - * is how it gets flushed down to disk at tree log commit time. - * - * the tree logging mutex keeps others from coming in and changing - * the new_root->node, so we can safely access it here - */ - set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, - new_root->node->start + new_root->node->len - 1, - GFP_NOFS); - -fail: - return ret; -} - -/* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people * syncing the tree wait for us to finish @@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, ret = btrfs_add_log_tree(trans, root); BUG_ON(ret); } - atomic_inc(&root->fs_info->tree_log_writers); - root->fs_info->tree_log_batch++; mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); return 0; } @@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root) if (!root->log_root) return -ENOENT; - mutex_lock(&root->fs_info->tree_log_mutex); + mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; - atomic_inc(&root->fs_info->tree_log_writers); - root->fs_info->tree_log_batch++; + atomic_inc(&root->log_writers); } - mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->log_mutex); return ret; } @@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root) */ static int end_log_trans(struct btrfs_root *root) { - atomic_dec(&root->fs_info->tree_log_writers); - smp_mb(); - if (waitqueue_active(&root->fs_info->tree_log_wait)) - wake_up(&root->fs_info->tree_log_wait); + if (atomic_dec_and_test(&root->log_writers)) { + smp_mb(); + if (waitqueue_active(&root->log_writer_wait)) + wake_up(&root->log_writer_wait); + } return 0; } @@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, log, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, } } btrfs_free_path(path); - if (wc->free) - free_extent_buffer(log->node); return ret; } -static int wait_log_commit(struct btrfs_root *log) +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } else { + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } + return ret; +} + +static int wait_log_commit(struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); - u64 transid = log->fs_info->tree_log_transid; + int index = transid % 2; + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ do { - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_commit)) + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])) schedule(); - finish_wait(&log->fs_info->tree_log_wait, &wait); - mutex_lock(&log->fs_info->tree_log_mutex); - } while (transid == log->fs_info->tree_log_transid && - atomic_read(&log->fs_info->tree_log_commit)); + finish_wait(&root->log_commit_wait[index], &wait); + mutex_lock(&root->log_mutex); + } while (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])); + return 0; +} + +static int wait_for_writer(struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + while (atomic_read(&root->log_writers)) { + prepare_to_wait(&root->log_writer_wait, + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (atomic_read(&root->log_writers)) + schedule(); + mutex_lock(&root->log_mutex); + finish_wait(&root->log_writer_wait, &wait); + } return 0; } @@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log) int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + int index1; + int index2; int ret; - unsigned long batch; struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - mutex_lock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_commit)) { - wait_log_commit(log); - goto out; + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(root, root->log_transid); + mutex_unlock(&root->log_mutex); + return 0; } - atomic_set(&log->fs_info->tree_log_commit, 1); + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(root, root->log_transid - 1); while (1) { - batch = log->fs_info->tree_log_batch; - mutex_unlock(&log->fs_info->tree_log_mutex); + unsigned long batch = root->log_batch; + mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); - mutex_lock(&log->fs_info->tree_log_mutex); - - while (atomic_read(&log->fs_info->tree_log_writers)) { - DEFINE_WAIT(wait); - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_writers)) - schedule(); - mutex_lock(&log->fs_info->tree_log_mutex); - finish_wait(&log->fs_info->tree_log_wait, &wait); - } - if (batch == log->fs_info->tree_log_batch) + mutex_lock(&root->log_mutex); + wait_for_writer(root); + if (batch == root->log_batch) break; } ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, - &root->fs_info->log_root_tree->dirty_log_pages); + + btrfs_set_root_bytenr(&log->root_item, log->node->start); + btrfs_set_root_generation(&log->root_item, trans->transid); + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + + root->log_batch = 0; + root->log_transid++; + log->log_transid = root->log_transid; + smp_mb(); + /* + * log tree has been flushed to disk, new modifications of + * the log will be written to new positions. so it's safe to + * allow log writers to go in. + */ + mutex_unlock(&root->log_mutex); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_writers); + mutex_unlock(&log_root_tree->log_mutex); + + ret = update_log_root(trans, log); + BUG_ON(ret); + + mutex_lock(&log_root_tree->log_mutex); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { + smp_mb(); + if (waitqueue_active(&log_root_tree->log_writer_wait)) + wake_up(&log_root_tree->log_writer_wait); + } + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + wait_log_commit(log_root_tree, log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) + wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); + + wait_for_writer(log_root_tree); + + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages); BUG_ON(ret); btrfs_set_super_log_root(&root->fs_info->super_for_commit, - log->fs_info->log_root_tree->node->start); + log_root_tree->node->start); btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, - btrfs_header_level(log->fs_info->log_root_tree->node)); + btrfs_header_level(log_root_tree->node)); + + log_root_tree->log_batch = 0; + log_root_tree->log_transid++; + smp_mb(); + + mutex_unlock(&log_root_tree->log_mutex); + + /* + * nobody else is going to jump in and write the the ctree + * super here because the log_commit atomic below is protecting + * us. We must be called with a transaction handle pinning + * the running transaction open, so a full commit can't hop + * in and cause problems either. + */ + write_ctree_super(trans, root->fs_info->tree_root, 2); - write_ctree_super(trans, log->fs_info->tree_root, 2); - log->fs_info->tree_log_transid++; - log->fs_info->tree_log_batch = 0; - atomic_set(&log->fs_info->tree_log_commit, 0); + atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); - if (waitqueue_active(&log->fs_info->tree_log_wait)) - wake_up(&log->fs_info->tree_log_wait); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) + wake_up(&log_root_tree->log_commit_wait[index2]); out: - mutex_unlock(&log->fs_info->tree_log_mutex); + atomic_set(&root->log_commit[index1], 0); + smp_mb(); + if (waitqueue_active(&root->log_commit_wait[index1])) + wake_up(&root->log_commit_wait[index1]); return 0; } @@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) start, end, GFP_NOFS); } - log = root->log_root; - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, - &log->root_key); - BUG_ON(ret); + if (log->log_transid > 0) { + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, + &log->root_key); + BUG_ON(ret); + } root->log_root = NULL; - kfree(root->log_root); + free_extent_buffer(log->node); + kfree(log); return 0; } /* - * helper function to update the item for a given subvolumes log root - * in the tree of log roots - */ -static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) -{ - u64 bytenr = btrfs_root_bytenr(&log->root_item); - int ret; - - if (log->node->start == bytenr) - return 0; - - btrfs_set_root_bytenr(&log->root_item, log->node->start); - btrfs_set_root_generation(&log->root_item, trans->transid); - btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); - ret = btrfs_update_root(trans, log->fs_info->log_root_tree, - &log->root_key, &log->root_item); - BUG_ON(ret); - return ret; -} - -/* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * @@ -2711,11 +2702,6 @@ next_slot: btrfs_free_path(path); btrfs_free_path(dst_path); - - mutex_lock(&root->fs_info->tree_log_mutex); - ret = update_log_root(trans, log); - BUG_ON(ret); - mutex_unlock(&root->fs_info->tree_log_mutex); out: return 0; } @@ -2846,7 +2832,9 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; + mutex_lock(&fs_info->trans_mutex); btrfs_record_root_in_trans(wc.replay_dest); + mutex_unlock(&fs_info->trans_mutex); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b187b537888..1316139bf9e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,7 +20,6 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/random.h> -#include <linux/version.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head, u64 devid, u8 *uuid) { struct btrfs_device *dev; - struct list_head *cur; - list_for_each(cur, head) { - dev = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(dev, head, dev_list) { if (dev->devid == devid && (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { return dev; @@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head, static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) { - struct list_head *cur; struct btrfs_fs_devices *fs_devices; - list_for_each(cur, &fs_uuids) { - fs_devices = list_entry(cur, struct btrfs_fs_devices, list); + list_for_each_entry(fs_devices, &fs_uuids, list) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) return fs_devices; } @@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) loop: spin_lock(&device->io_lock); +loop_lock: /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted @@ -208,7 +204,7 @@ loop: * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && + if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { struct bio *old_head; @@ -221,6 +217,8 @@ loop: else device->pending_bio_tail = tail; + device->running_pending = 1; + spin_unlock(&device->io_lock); btrfs_requeue_work(&device->work); goto done; @@ -228,6 +226,11 @@ loop: } if (again) goto loop; + + spin_lock(&device->io_lock); + if (device->pending_bios) + goto loop_lock; + spin_unlock(&device->io_lock); done: return 0; } @@ -344,14 +347,11 @@ error: int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { - struct list_head *tmp; - struct list_head *cur; - struct btrfs_device *device; + struct btrfs_device *device, *next; mutex_lock(&uuid_mutex); again: - list_for_each_safe(cur, tmp, &fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) continue; @@ -382,14 +382,12 @@ again: static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct list_head *cur; struct btrfs_device *device; if (--fs_devices->opened > 0) return 0; - list_for_each(cur, &fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { close_bdev_exclusive(device->bdev, device->mode); fs_devices->open_devices--; @@ -438,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, { struct block_device *bdev; struct list_head *head = &fs_devices->devices; - struct list_head *cur; struct btrfs_device *device; struct block_device *latest_bdev = NULL; struct buffer_head *bh; @@ -449,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; - list_for_each(cur, head) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) @@ -577,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, *(unsigned long long *)disk_super->fsid, *(unsigned long long *)(disk_super->fsid + 8)); } - printk(KERN_INFO "devid %llu transid %llu %s\n", + printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); @@ -1016,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if (strcmp(device_path, "missing") == 0) { - struct list_head *cur; struct list_head *devices; struct btrfs_device *tmp; device = NULL; devices = &root->fs_info->fs_devices->devices; - list_for_each(cur, devices) { - tmp = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(tmp, devices, dev_list) { if (tmp->in_fs_metadata && !tmp->bdev) { device = tmp; break; @@ -1279,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; - struct list_head *cur; struct list_head *devices; struct super_block *sb = root->fs_info->sb; u64 total_bytes; @@ -1303,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; - list_for_each(cur, devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; goto error; @@ -1703,7 +1695,6 @@ static u64 div_factor(u64 num, int factor) int btrfs_balance(struct btrfs_root *dev_root) { int ret; - struct list_head *cur; struct list_head *devices = &dev_root->fs_info->fs_devices->devices; struct btrfs_device *device; u64 old_size; @@ -1722,8 +1713,7 @@ int btrfs_balance(struct btrfs_root *dev_root) dev_root = dev_root->fs_info->dev_root; /* step one make some room on all the devices */ - list_for_each(cur, devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); @@ -2904,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); } - map = kzalloc(sizeof(*map), GFP_NOFS); - if (!map) - return -ENOMEM; - em = alloc_extent_map(GFP_NOFS); if (!em) return -ENOMEM; @@ -3116,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 7f332e27089..a9d3bf4d268 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/xattr.h> +#include <linux/security.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name, /* lookup the xattr by name */ di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, strlen(name), 0); - if (!di || IS_ERR(di)) { + if (!di) { ret = -ENODATA; goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; } leaf = path->nodes[0]; @@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name, ret = -ERANGE; goto out; } + + /* + * The way things are packed into the leaf is like this + * |struct btrfs_dir_item|name|data| + * where name is the xattr name, so security.foo, and data is the + * content of the xattr. data_ptr points to the location in memory + * where the data starts in the in memory leaf + */ data_ptr = (unsigned long)((char *)(di + 1) + btrfs_dir_name_len(leaf, di)); read_extent_buffer(leaf, buffer, data_ptr, @@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name, if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); /* first lets see if we already have this xattr */ @@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - ret = 0; advance = 0; while (1) { leaf = path->nodes[0]; @@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } + +int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) +{ + int err; + size_t len; + void *value; + char *suffix; + char *name; + + err = security_inode_init_security(inode, dir, &suffix, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, + GFP_NOFS); + if (!name) { + err = -ENOMEM; + } else { + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); + err = __btrfs_setxattr(inode, name, value, len, 0); + kfree(name); + } + + kfree(suffix); + kfree(value); + return err; +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 5b1d08f8e68..c71e9c3cf3f 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); +extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); + #endif /* __XATTR__ */ |